├── README.md └── lambda_functions └── pdf_text_extract ├── _markerlib ├── __init__.py ├── __init__.pyc ├── markers.py └── markers.pyc ├── distribute-0.7.3-py2.7.egg-info ├── PKG-INFO ├── SOURCES.txt ├── dependency_links.txt ├── installed-files.txt ├── requires.txt ├── top_level.txt └── zip-safe ├── easy_install.py ├── easy_install.pyc ├── elasticsearch-2.2.0.dist-info ├── DESCRIPTION.rst ├── METADATA ├── RECORD ├── WHEEL ├── metadata.json ├── pbr.json └── top_level.txt ├── elasticsearch ├── __init__.py ├── __init__.pyc ├── client │ ├── __init__.py │ ├── __init__.pyc │ ├── cat.py │ ├── cat.pyc │ ├── cluster.py │ ├── cluster.pyc │ ├── indices.py │ ├── indices.pyc │ ├── nodes.py │ ├── nodes.pyc │ ├── snapshot.py │ ├── snapshot.pyc │ ├── utils.py │ └── utils.pyc ├── compat.py ├── compat.pyc ├── connection │ ├── __init__.py │ ├── __init__.pyc │ ├── base.py │ ├── base.pyc │ ├── esthrift │ │ ├── Rest.py │ │ ├── Rest.pyc │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── constants.py │ │ ├── constants.pyc │ │ ├── ttypes.py │ │ └── ttypes.pyc │ ├── http_requests.py │ ├── http_requests.pyc │ ├── http_urllib3.py │ ├── http_urllib3.pyc │ ├── memcached.py │ ├── memcached.pyc │ ├── pooling.py │ ├── pooling.pyc │ ├── thrift.py │ └── thrift.pyc ├── connection_pool.py ├── connection_pool.pyc ├── exceptions.py ├── exceptions.pyc ├── helpers │ ├── __init__.py │ ├── __init__.pyc │ ├── test.py │ └── test.pyc ├── serializer.py ├── serializer.pyc ├── transport.py └── transport.pyc ├── lambda_function.py ├── lambda_function.zip ├── pdfminer-20110515-py2.7.egg-info ├── PKG-INFO ├── SOURCES.txt ├── dependency_links.txt ├── installed-files.txt └── top_level.txt ├── pdfminer ├── __init__.py ├── __init__.pyc ├── arcfour.py ├── arcfour.pyc ├── ascii85.py ├── ascii85.pyc ├── cmap │ ├── __init__.py │ └── __init__.pyc ├── cmapdb.py ├── cmapdb.pyc ├── converter.py ├── converter.pyc ├── encodingdb.py ├── encodingdb.pyc ├── fontmetrics.py ├── fontmetrics.pyc ├── glyphlist.py ├── glyphlist.pyc ├── 
latin_enc.py ├── latin_enc.pyc ├── layout.py ├── layout.pyc ├── lzw.py ├── lzw.pyc ├── pdfcolor.py ├── pdfcolor.pyc ├── pdfdevice.py ├── pdfdevice.pyc ├── pdffont.py ├── pdffont.pyc ├── pdfinterp.py ├── pdfinterp.pyc ├── pdfparser.py ├── pdfparser.pyc ├── pdftypes.py ├── pdftypes.pyc ├── psparser.py ├── psparser.pyc ├── rijndael.py ├── rijndael.pyc ├── runlength.py ├── runlength.pyc ├── utils.py └── utils.pyc ├── pkg_resources ├── __init__.py ├── __init__.pyc └── _vendor │ ├── __init__.py │ ├── __init__.pyc │ └── packaging │ ├── __about__.py │ ├── __about__.pyc │ ├── __init__.py │ ├── __init__.pyc │ ├── _compat.py │ ├── _compat.pyc │ ├── _structures.py │ ├── _structures.pyc │ ├── specifiers.py │ ├── specifiers.pyc │ ├── version.py │ └── version.pyc ├── setuptools-19.2.dist-info ├── DESCRIPTION.rst ├── METADATA ├── RECORD ├── WHEEL ├── dependency_links.txt ├── entry_points.txt ├── metadata.json ├── top_level.txt └── zip-safe ├── setuptools ├── __init__.py ├── __init__.pyc ├── archive_util.py ├── archive_util.pyc ├── cli-32.exe ├── cli-64.exe ├── cli-arm-32.exe ├── cli.exe ├── command │ ├── __init__.py │ ├── __init__.pyc │ ├── alias.py │ ├── alias.pyc │ ├── bdist_egg.py │ ├── bdist_egg.pyc │ ├── bdist_rpm.py │ ├── bdist_rpm.pyc │ ├── bdist_wininst.py │ ├── bdist_wininst.pyc │ ├── build_ext.py │ ├── build_ext.pyc │ ├── build_py.py │ ├── build_py.pyc │ ├── develop.py │ ├── develop.pyc │ ├── easy_install.py │ ├── easy_install.pyc │ ├── egg_info.py │ ├── egg_info.pyc │ ├── install.py │ ├── install.pyc │ ├── install_egg_info.py │ ├── install_egg_info.pyc │ ├── install_lib.py │ ├── install_lib.pyc │ ├── install_scripts.py │ ├── install_scripts.pyc │ ├── launcher manifest.xml │ ├── register.py │ ├── register.pyc │ ├── rotate.py │ ├── rotate.pyc │ ├── saveopts.py │ ├── saveopts.pyc │ ├── sdist.py │ ├── sdist.pyc │ ├── setopt.py │ ├── setopt.pyc │ ├── test.py │ ├── test.pyc │ ├── upload_docs.py │ └── upload_docs.pyc ├── compat.py ├── compat.pyc ├── depends.py ├── depends.pyc 
├── dist.py ├── dist.pyc ├── extension.py ├── extension.pyc ├── gui-32.exe ├── gui-64.exe ├── gui-arm-32.exe ├── gui.exe ├── lib2to3_ex.py ├── lib2to3_ex.pyc ├── msvc9_support.py ├── msvc9_support.pyc ├── package_index.py ├── package_index.pyc ├── py26compat.py ├── py26compat.pyc ├── py27compat.py ├── py27compat.pyc ├── py31compat.py ├── py31compat.pyc ├── sandbox.py ├── sandbox.pyc ├── script (dev).tmpl ├── script.tmpl ├── site-patch.py ├── site-patch.pyc ├── ssl_support.py ├── ssl_support.pyc ├── unicode_utils.py ├── unicode_utils.pyc ├── utils.py ├── utils.pyc ├── version.py ├── version.pyc ├── windows_support.py └── windows_support.pyc ├── slate-0.3-py2.7.egg-info ├── PKG-INFO ├── SOURCES.txt ├── dependency_links.txt ├── installed-files.txt ├── requires.txt └── top_level.txt ├── slate ├── __init__.py ├── __init__.pyc ├── conftest.py ├── conftest.pyc ├── slate.py ├── slate.pyc ├── test_slate.py ├── test_slate.pyc ├── utils.py └── utils.pyc ├── urllib3-1.14.dist-info ├── DESCRIPTION.rst ├── METADATA ├── RECORD ├── WHEEL ├── metadata.json ├── pbr.json └── top_level.txt └── urllib3 ├── __init__.py ├── __init__.pyc ├── _collections.py ├── _collections.pyc ├── connection.py ├── connection.pyc ├── connectionpool.py ├── connectionpool.pyc ├── contrib ├── __init__.py ├── __init__.pyc ├── appengine.py ├── appengine.pyc ├── ntlmpool.py ├── ntlmpool.pyc ├── pyopenssl.py ├── pyopenssl.pyc ├── socks.py └── socks.pyc ├── exceptions.py ├── exceptions.pyc ├── fields.py ├── fields.pyc ├── filepost.py ├── filepost.pyc ├── packages ├── __init__.py ├── __init__.pyc ├── ordered_dict.py ├── ordered_dict.pyc ├── six.py ├── six.pyc └── ssl_match_hostname │ ├── __init__.py │ ├── __init__.pyc │ ├── _implementation.py │ └── _implementation.pyc ├── poolmanager.py ├── poolmanager.pyc ├── request.py ├── request.pyc ├── response.py ├── response.pyc └── util ├── __init__.py ├── __init__.pyc ├── connection.py ├── connection.pyc ├── request.py ├── request.pyc ├── response.py ├── response.pyc ├── 
retry.py ├── retry.pyc ├── ssl_.py ├── ssl_.pyc ├── timeout.py ├── timeout.pyc ├── url.py └── url.pyc /README.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | This code, written to be executed as an AWS Lambda function, uses the Slate module to extract the text from a PDF file, and then indexes that text to an Elasticsearch cluster. It is designed to be invoked when a PDF document is put to an S3 bucket. 4 | 5 | 6 | A few implementation notes: 7 | * Because this is just a simple PoC, the only text indexed to Elasticsearch is the text on the first page 8 | * Play around with the Lambda timeout to find a value that works for the document sizes you're placing in the S3 bucket 9 | * For smaller PDF docs, I've observed memory utilization (in CWL) of low 10s of Mbytes 10 | * This assumes some familiarity with AWS Lambda basics (configuring event sources, invocation policies, etc.) 11 | * Specify a suffix of 'pdf' to make sure it's only executing for pdf files 12 | 13 | To be implemented: 14 | * Signing of POSTs to Elasticsearch endpoints using SigV4, instead of using python modules -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/_markerlib/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | import ast 3 | from _markerlib.markers import default_environment, compile, interpret 4 | except ImportError: 5 | if 'ast' in globals(): 6 | raise 7 | def default_environment(): 8 | return {} 9 | def compile(marker): 10 | def marker_fn(environment=None, override=None): 11 | # 'empty markers are True' heuristic won't install extra deps. 
12 | return not marker.strip() 13 | marker_fn.__doc__ = marker 14 | return marker_fn 15 | def interpret(marker, environment=None, override=None): 16 | return compile(marker)() 17 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/_markerlib/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/_markerlib/__init__.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/_markerlib/markers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Interpret PEP 345 environment markers. 3 | 4 | EXPR [in|==|!=|not in] EXPR [or|and] ... 5 | 6 | where EXPR belongs to any of those: 7 | 8 | python_version = '%s.%s' % (sys.version_info[0], sys.version_info[1]) 9 | python_full_version = sys.version.split()[0] 10 | os.name = os.name 11 | sys.platform = sys.platform 12 | platform.version = platform.version() 13 | platform.machine = platform.machine() 14 | platform.python_implementation = platform.python_implementation() 15 | a free string, like '2.6', or 'win32' 16 | """ 17 | 18 | __all__ = ['default_environment', 'compile', 'interpret'] 19 | 20 | import ast 21 | import os 22 | import platform 23 | import sys 24 | import weakref 25 | 26 | _builtin_compile = compile 27 | 28 | try: 29 | from platform import python_implementation 30 | except ImportError: 31 | if os.name == "java": 32 | # Jython 2.5 has ast module, but not platform.python_implementation() function. 
33 | def python_implementation(): 34 | return "Jython" 35 | else: 36 | raise 37 | 38 | 39 | # restricted set of variables 40 | _VARS = {'sys.platform': sys.platform, 41 | 'python_version': '%s.%s' % sys.version_info[:2], 42 | # FIXME parsing sys.platform is not reliable, but there is no other 43 | # way to get e.g. 2.7.2+, and the PEP is defined with sys.version 44 | 'python_full_version': sys.version.split(' ', 1)[0], 45 | 'os.name': os.name, 46 | 'platform.version': platform.version(), 47 | 'platform.machine': platform.machine(), 48 | 'platform.python_implementation': python_implementation(), 49 | 'extra': None # wheel extension 50 | } 51 | 52 | for var in list(_VARS.keys()): 53 | if '.' in var: 54 | _VARS[var.replace('.', '_')] = _VARS[var] 55 | 56 | def default_environment(): 57 | """Return copy of default PEP 385 globals dictionary.""" 58 | return dict(_VARS) 59 | 60 | class ASTWhitelist(ast.NodeTransformer): 61 | def __init__(self, statement): 62 | self.statement = statement # for error messages 63 | 64 | ALLOWED = (ast.Compare, ast.BoolOp, ast.Attribute, ast.Name, ast.Load, ast.Str) 65 | # Bool operations 66 | ALLOWED += (ast.And, ast.Or) 67 | # Comparison operations 68 | ALLOWED += (ast.Eq, ast.Gt, ast.GtE, ast.In, ast.Is, ast.IsNot, ast.Lt, ast.LtE, ast.NotEq, ast.NotIn) 69 | 70 | def visit(self, node): 71 | """Ensure statement only contains allowed nodes.""" 72 | if not isinstance(node, self.ALLOWED): 73 | raise SyntaxError('Not allowed in environment markers.\n%s\n%s' % 74 | (self.statement, 75 | (' ' * node.col_offset) + '^')) 76 | return ast.NodeTransformer.visit(self, node) 77 | 78 | def visit_Attribute(self, node): 79 | """Flatten one level of attribute access.""" 80 | new_node = ast.Name("%s.%s" % (node.value.id, node.attr), node.ctx) 81 | return ast.copy_location(new_node, node) 82 | 83 | def parse_marker(marker): 84 | tree = ast.parse(marker, mode='eval') 85 | new_tree = ASTWhitelist(marker).generic_visit(tree) 86 | return new_tree 87 | 88 | def 
compile_marker(parsed_marker): 89 | return _builtin_compile(parsed_marker, '', 'eval', 90 | dont_inherit=True) 91 | 92 | _cache = weakref.WeakValueDictionary() 93 | 94 | def compile(marker): 95 | """Return compiled marker as a function accepting an environment dict.""" 96 | try: 97 | return _cache[marker] 98 | except KeyError: 99 | pass 100 | if not marker.strip(): 101 | def marker_fn(environment=None, override=None): 102 | """""" 103 | return True 104 | else: 105 | compiled_marker = compile_marker(parse_marker(marker)) 106 | def marker_fn(environment=None, override=None): 107 | """override updates environment""" 108 | if override is None: 109 | override = {} 110 | if environment is None: 111 | environment = default_environment() 112 | environment.update(override) 113 | return eval(compiled_marker, environment) 114 | marker_fn.__doc__ = marker 115 | _cache[marker] = marker_fn 116 | return _cache[marker] 117 | 118 | def interpret(marker, environment=None): 119 | return compile(marker)(environment) 120 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/_markerlib/markers.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/_markerlib/markers.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/distribute-0.7.3-py2.7.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 1.1 2 | Name: distribute 3 | Version: 0.7.3 4 | Summary: distribute legacy wrapper 5 | Home-page: http://packages.python.org/distribute 6 | Author: The fellowship of the packaging 7 | Author-email: distutils-sig@python.org 8 | License: PSF or ZPL 9 | Description: Distribute - legacy package 10 | 11 | This package is a simple 
compatibility layer that installs Setuptools 0.7+. 12 | 13 | Keywords: CPAN PyPI distutils eggs package management 14 | Platform: UNKNOWN 15 | Classifier: Development Status :: 5 - Production/Stable 16 | Classifier: Intended Audience :: Developers 17 | Classifier: License :: OSI Approved :: Python Software Foundation License 18 | Classifier: License :: OSI Approved :: Zope Public License 19 | Classifier: Operating System :: OS Independent 20 | Classifier: Programming Language :: Python :: 2.4 21 | Classifier: Programming Language :: Python :: 2.5 22 | Classifier: Programming Language :: Python :: 2.6 23 | Classifier: Programming Language :: Python :: 2.7 24 | Classifier: Programming Language :: Python :: 3 25 | Classifier: Programming Language :: Python :: 3.1 26 | Classifier: Programming Language :: Python :: 3.2 27 | Classifier: Programming Language :: Python :: 3.3 28 | Classifier: Topic :: Software Development :: Libraries :: Python Modules 29 | Classifier: Topic :: System :: Archiving :: Packaging 30 | Classifier: Topic :: System :: Systems Administration 31 | Classifier: Topic :: Utilities 32 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/distribute-0.7.3-py2.7.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | MANIFEST.in 2 | pkg_resources.py 3 | setup.cfg 4 | setup.py 5 | distribute.egg-info/PKG-INFO 6 | distribute.egg-info/SOURCES.txt 7 | distribute.egg-info/dependency_links.txt 8 | distribute.egg-info/requires.txt 9 | distribute.egg-info/top_level.txt 10 | distribute.egg-info/zip-safe 11 | setuptools/__init__.py 12 | setuptools/archive_util.py 13 | setuptools/compat.py 14 | setuptools/depends.py 15 | setuptools/dist.py 16 | setuptools/extension.py 17 | setuptools/package_index.py 18 | setuptools/py24compat.py 19 | setuptools/py27compat.py 20 | setuptools/sandbox.py 21 | setuptools/script template (dev).py 22 | setuptools/script 
template.py 23 | setuptools/site-patch.py 24 | setuptools/ssl_support.py 25 | setuptools.egg-info/PKG-INFO 26 | setuptools.egg-info/SOURCES.txt 27 | setuptools.egg-info/dependency_links.txt 28 | setuptools.egg-info/entry_points.txt 29 | setuptools.egg-info/entry_points.txt.orig 30 | setuptools.egg-info/requires.txt 31 | setuptools.egg-info/requires.txt.orig 32 | setuptools.egg-info/top_level.txt 33 | setuptools.egg-info/zip-safe 34 | setuptools/command/__init__.py 35 | setuptools/command/alias.py 36 | setuptools/command/bdist_egg.py 37 | setuptools/command/bdist_rpm.py 38 | setuptools/command/bdist_wininst.py 39 | setuptools/command/build_ext.py 40 | setuptools/command/build_py.py 41 | setuptools/command/develop.py 42 | setuptools/command/easy_install.py 43 | setuptools/command/egg_info.py 44 | setuptools/command/install.py 45 | setuptools/command/install_egg_info.py 46 | setuptools/command/install_lib.py 47 | setuptools/command/install_scripts.py 48 | setuptools/command/launcher manifest.xml 49 | setuptools/command/register.py 50 | setuptools/command/rotate.py 51 | setuptools/command/saveopts.py 52 | setuptools/command/sdist.py 53 | setuptools/command/setopt.py 54 | setuptools/command/test.py 55 | setuptools/command/upload.py 56 | setuptools/command/upload_docs.py -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/distribute-0.7.3-py2.7.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/distribute-0.7.3-py2.7.egg-info/installed-files.txt: -------------------------------------------------------------------------------- 1 | ./ 2 | dependency_links.txt 3 | PKG-INFO 4 | requires.txt 5 | SOURCES.txt 6 | top_level.txt 7 | zip-safe 8 | -------------------------------------------------------------------------------- 
/lambda_functions/pdf_text_extract/distribute-0.7.3-py2.7.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | setuptools>=0.7 -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/distribute-0.7.3-py2.7.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/distribute-0.7.3-py2.7.egg-info/zip-safe: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/easy_install.py: -------------------------------------------------------------------------------- 1 | """Run the EasyInstall command""" 2 | 3 | if __name__ == '__main__': 4 | from setuptools.command.easy_install import main 5 | main() 6 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/easy_install.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/easy_install.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/elasticsearch-2.2.0.dist-info/DESCRIPTION.rst: -------------------------------------------------------------------------------- 1 | Python Elasticsearch Client 2 | =========================== 3 | 4 | Official low-level client for Elasticsearch. Its goal is to provide common 5 | ground for all Elasticsearch-related code in Python; because of this it tries 6 | to be opinion-free and very extendable. 
7 | 8 | For a more high level client library with more limited scope, have a look at 9 | `elasticsearch-dsl`_ - a more pythonic library sitting on top of 10 | ``elasticsearch-py``. 11 | 12 | It provides a more convenient and idiomatic way to write and manipulate 13 | `queries`_. It stays close to the Elasticsearch JSON DSL, mirroring its 14 | terminology and structure while exposing the whole range of the DSL from Python 15 | either directly using defined classes or a queryset-like expressions. 16 | 17 | It also provides an optional `persistence layer`_ for working with documents as 18 | Python objects in an ORM-like fashion: defining mappings, retrieving and saving 19 | documents, wrapping the document data in user-defined classes. 20 | 21 | .. _elasticsearch-dsl: http://elasticsearch-dsl.rtfd.org/ 22 | .. _queries: http://elasticsearch-dsl.readthedocs.org/en/latest/search_dsl.html 23 | .. _persistence layer: http://elasticsearch-dsl.readthedocs.org/en/latest/persistence.html#doctype 24 | 25 | Compatibility 26 | ------------- 27 | 28 | The library is compatible with all Elasticsearch versions since ``0.90.x`` but you 29 | **have to use a matching major version**: 30 | 31 | For **Elasticsearch 2.0** and later, use the major version 2 (``2.x.y``) of the 32 | library. 33 | 34 | For **Elasticsearch 1.0** and later, use the major version 1 (``1.x.y``) of the 35 | library. 36 | 37 | For **Elasticsearch 0.90.x**, use a version from ``0.4.x`` releases of the 38 | library. 39 | 40 | The recommended way to set your requirements in your `setup.py` or 41 | `requirements.txt` is:: 42 | 43 | # Elasticsearch 2.x 44 | elasticsearch>=2.0.0,<3.0.0 45 | 46 | # Elasticsearch 1.x 47 | elasticsearch>=1.0.0,<2.0.0 48 | 49 | # Elasticsearch 0.90.x 50 | elasticsearch<1.0.0 51 | 52 | The development is happening on ``master`` and ``1.x`` branches, respectively. 
53 | 54 | Installation 55 | ------------ 56 | 57 | Install the ``elasticsearch`` package with `pip 58 | `_:: 59 | 60 | pip install elasticsearch 61 | 62 | 63 | Example use 64 | ----------- 65 | 66 | Simple use-case:: 67 | 68 | >>> from datetime import datetime 69 | >>> from elasticsearch import Elasticsearch 70 | 71 | # by default we connect to localhost:9200 72 | >>> es = Elasticsearch() 73 | 74 | # create an index in elasticsearch, ignore status code 400 (index already exists) 75 | >>> es.indices.create(index='my-index', ignore=400) 76 | {u'acknowledged': True} 77 | 78 | # datetimes will be serialized 79 | >>> es.index(index="my-index", doc_type="test-type", id=42, body={"any": "data", "timestamp": datetime.now()}) 80 | {u'_id': u'42', u'_index': u'my-index', u'_type': u'test-type', u'_version': 1, u'ok': True} 81 | 82 | # but not deserialized 83 | >>> es.get(index="my-index", doc_type="test-type", id=42)['_source'] 84 | {u'any': u'data', u'timestamp': u'2013-05-12T19:45:31.804229'} 85 | 86 | `Full documentation`_. 87 | 88 | .. _Full documentation: http://elasticsearch-py.rtfd.org/ 89 | 90 | 91 | Features 92 | -------- 93 | 94 | The client's features include: 95 | 96 | * translating basic Python data types to and from json (datetimes are not 97 | decoded for performance reasons) 98 | * configurable automatic discovery of cluster nodes 99 | * persistent connections 100 | * load balancing (with pluggable selection strategy) across all available nodes 101 | * failed connection penalization (time based - failed connections won't be 102 | retried until a timeout is reached) 103 | * support for ssl and http authentication 104 | * thread safety 105 | * pluggable architecture 106 | 107 | 108 | License 109 | ------- 110 | 111 | Copyright 2015 Elasticsearch 112 | 113 | Licensed under the Apache License, Version 2.0 (the "License"); 114 | you may not use this file except in compliance with the License. 
115 | You may obtain a copy of the License at 116 | 117 | http://www.apache.org/licenses/LICENSE-2.0 118 | 119 | Unless required by applicable law or agreed to in writing, software 120 | distributed under the License is distributed on an "AS IS" BASIS, 121 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 122 | See the License for the specific language governing permissions and 123 | limitations under the License. 124 | 125 | Build status 126 | ------------ 127 | 128 | .. image:: https://secure.travis-ci.org/elastic/elasticsearch-py.png 129 | :target: http://travis-ci.org/#!/elastic/elasticsearch-py 130 | 131 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/elasticsearch-2.2.0.dist-info/RECORD: -------------------------------------------------------------------------------- 1 | elasticsearch/compat.py,sha256=MLDabdJN3w5KkeVykvh0rpUO1SFFds0mBnDWZJ30SdI,312 2 | elasticsearch/transport.py,sha256=El6Li6eHmkFZxeLMfWiZb-nf-zBnsXBwQR8KRamiAiU,15277 3 | elasticsearch/exceptions.py,sha256=RGOIdnNlpKYB9-Hy-gyMIALNq3CxOdzPu4bySZN97q8,3127 4 | elasticsearch/connection_pool.py,sha256=re2lYiUtyO3CSzkJLZc8L2ooVZ76bH7rKm1oQ_5SG9Q,9607 5 | elasticsearch/serializer.py,sha256=qqyY9_JaaPuTw1UAbL99CSKllbvXOHF0VfCXcupM4sk,2287 6 | elasticsearch/__init__.py,sha256=QLTdsCcV3Hy1XLmKE6qVe8BBGGMajdpRU4FWhQTmXzc,837 7 | elasticsearch/client/cluster.py,sha256=ez_hB0Ywkn8s1QSh-Sdolr9EEaKXPiVQ1T8CU1v0udQ,7619 8 | elasticsearch/client/cat.py,sha256=2egI0ak1z9u8ruR_rp9BqQxLOpDltt4dgL1o99QnN_Q,15962 9 | elasticsearch/client/snapshot.py,sha256=w-cteBMUMYV0DOU1oJXFqRnhfvGjb0dJreSbj2Uy6Vs,7609 10 | elasticsearch/client/indices.py,sha256=juLLjjOtNo0BL6DzMcyXS2Ch_PrwL24M89Ut7XewezM,52688 11 | elasticsearch/client/utils.py,sha256=O9ELpuzQIShFON8UBhTo87T4vzQTQwHga8oVcskiVOk,2645 12 | elasticsearch/client/__init__.py,sha256=q4_DfE7ImodKSbvdndnYM8xNCP2lCCSeOX-zb6vhu5U,63969 13 | 
elasticsearch/client/nodes.py,sha256=P8YUL-BppKfn4tdw3gL5hEbne5nw5hzu_oivmQVXd0I,4805 14 | elasticsearch/connection/memcached.py,sha256=R8fqOEuTypsph4R0dD0GDsGz4s6vcQXEQTERZ3lRvEg,2878 15 | elasticsearch/connection/thrift.py,sha256=wOwxDADn03OTK06Dc9Uhgj8BGhwXcy-eAvYWRp7fGps,3872 16 | elasticsearch/connection/pooling.py,sha256=19x-kzyqndWJnnMGZ1ucoZvpMIliH3q3oJLe34OEROA,798 17 | elasticsearch/connection/http_urllib3.py,sha256=62rcfrXs2ygZxEC2j7j1xbvtwBmYKvamFl9sBfYFXzo,5029 18 | elasticsearch/connection/base.py,sha256=g4CNB4wa5ztJpGTEcx-k--KOU0I6h1q0ln4n6qZX4ik,4265 19 | elasticsearch/connection/http_requests.py,sha256=oBGf1xLPE4MInMaT5H2dJY14_aTh9y18eNmXqNsCkYY,3659 20 | elasticsearch/connection/__init__.py,sha256=V50xojadYNqjTkV1KdY4GQnxfG-hrxr-FkmQknvYjJo,127 21 | elasticsearch/connection/esthrift/Rest.py,sha256=OPIqP9KqsokuAcUnwvBXZaVUGr1qQTs4AicYoOIMGFI,6561 22 | elasticsearch/connection/esthrift/constants.py,sha256=hz320xjF2ljmXqRIVWUkjJMXnT9BegVGF2K7hnqsdHs,276 23 | elasticsearch/connection/esthrift/ttypes.py,sha256=XDpDtdxrrvBVMMXsiu3jLONRdYz8rcHJ9GSuJz0MPzc,12040 24 | elasticsearch/connection/esthrift/__init__.py,sha256=oX5iaOEMn-aGOlg5vEc9N8Fid1iNh0zWE5WvNgi7ysI,42 25 | elasticsearch/helpers/test.py,sha256=iyaFosPjWYupZOcpk6kYdIzpw6-qNGkfWuK-G-iFtYQ,1839 26 | elasticsearch/helpers/__init__.py,sha256=pQhagNrZI-N9pqdgPt5eFFFILKMThvyhq__1eTQ0brQ,14052 27 | elasticsearch-2.2.0.dist-info/pbr.json,sha256=7LDsmS2o3dnopaClq6T03zl5eegwepeyDWzJEMMl-Jg,47 28 | elasticsearch-2.2.0.dist-info/top_level.txt,sha256=Jp2bLWq49skvCN4YCZsg1Hfn_NDLgleC-x-Bn01_HgM,14 29 | elasticsearch-2.2.0.dist-info/WHEEL,sha256=AvR0WeTpDaxT645bl5FQxUK6NPsTls2ttpcGJg3j1Xg,110 30 | elasticsearch-2.2.0.dist-info/METADATA,sha256=lDAm41FzDBFrm537DYEpGSUlma2UUcijQkuTvmyCADQ,5261 31 | elasticsearch-2.2.0.dist-info/DESCRIPTION.rst,sha256=oVlNdYwfM0G2v0gLd_xVaUcGpbX5n4cmTM96JZb9DWo,4221 32 | elasticsearch-2.2.0.dist-info/metadata.json,sha256=hWhX3zVXpCY1lIXYrdvVzTrrpZ5W11LD5W_dJYLdKoc,1272 33 | 
elasticsearch-2.2.0.dist-info/RECORD,, 34 | elasticsearch/transport.pyc,, 35 | elasticsearch/client/cluster.pyc,, 36 | elasticsearch/connection/pooling.pyc,, 37 | elasticsearch/compat.pyc,, 38 | elasticsearch/client/utils.pyc,, 39 | elasticsearch/client/snapshot.pyc,, 40 | elasticsearch/connection/esthrift/ttypes.pyc,, 41 | elasticsearch/client/indices.pyc,, 42 | elasticsearch/connection/__init__.pyc,, 43 | elasticsearch/connection/esthrift/__init__.pyc,, 44 | elasticsearch/__init__.pyc,, 45 | elasticsearch/connection/thrift.pyc,, 46 | elasticsearch/client/nodes.pyc,, 47 | elasticsearch/connection/esthrift/Rest.pyc,, 48 | elasticsearch/client/__init__.pyc,, 49 | elasticsearch/client/cat.pyc,, 50 | elasticsearch/connection/http_urllib3.pyc,, 51 | elasticsearch/helpers/__init__.pyc,, 52 | elasticsearch/connection/memcached.pyc,, 53 | elasticsearch/serializer.pyc,, 54 | elasticsearch/connection_pool.pyc,, 55 | elasticsearch/connection/esthrift/constants.pyc,, 56 | elasticsearch/connection/http_requests.pyc,, 57 | elasticsearch/exceptions.pyc,, 58 | elasticsearch/connection/base.pyc,, 59 | elasticsearch/helpers/test.pyc,, 60 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/elasticsearch-2.2.0.dist-info/WHEEL: -------------------------------------------------------------------------------- 1 | Wheel-Version: 1.0 2 | Generator: bdist_wheel (0.24.0) 3 | Root-Is-Purelib: true 4 | Tag: py2-none-any 5 | Tag: py3-none-any 6 | 7 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/elasticsearch-2.2.0.dist-info/metadata.json: -------------------------------------------------------------------------------- 1 | {"test_requires": [{"requires": ["requests (>=1.0.0,<3.0.0)", "nose", "coverage", "mock", "pyaml", "nosexcover"]}], "generator": "bdist_wheel (0.24.0)", "extensions": {"python.details": {"contacts": [{"email": "honza.kral@gmail.com", "role": 
"author", "name": "Honza Kr\u00e1l"}], "project_urls": {"Home": "https://github.com/elastic/elasticsearch-py"}, "document_names": {"description": "DESCRIPTION.rst"}}}, "name": "elasticsearch", "version": "2.2.0", "classifiers": ["Development Status :: 5 - Production/Stable", "License :: OSI Approved :: Apache Software License", "Intended Audience :: Developers", "Operating System :: OS Independent", "Programming Language :: Python", "Programming Language :: Python :: 2", "Programming Language :: Python :: 2.6", "Programming Language :: Python :: 2.7", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.2", "Programming Language :: Python :: 3.3", "Programming Language :: Python :: 3.4", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy"], "run_requires": [{"requires": ["urllib3 (>=1.8,<2.0)"]}], "extras": [], "license": "Apache License, Version 2.0", "summary": "Python client for Elasticsearch", "metadata_version": "2.0"} -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/elasticsearch-2.2.0.dist-info/pbr.json: -------------------------------------------------------------------------------- 1 | {"git_version": "14f5b35", "is_release": false} -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/elasticsearch-2.2.0.dist-info/top_level.txt: -------------------------------------------------------------------------------- 1 | elasticsearch 2 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/elasticsearch/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | 3 | VERSION = (2, 2, 0) 4 | __version__ = VERSION 5 | __versionstr__ = '.'.join(map(str, VERSION)) 6 | 7 | import sys 8 | 9 | if (2, 7) <= 
sys.version_info < (3, 2): 10 | # On Python 2.7 and Python3 < 3.2, install no-op handler to silence 11 | # `No handlers could be found for logger "elasticsearch"` message per 12 | # 13 | import logging 14 | logger = logging.getLogger('elasticsearch') 15 | logger.addHandler(logging.NullHandler()) 16 | 17 | from .client import Elasticsearch 18 | from .transport import Transport 19 | from .connection_pool import ConnectionPool, ConnectionSelector, \ 20 | RoundRobinSelector 21 | from .serializer import JSONSerializer 22 | from .connection import Connection, RequestsHttpConnection, \ 23 | Urllib3HttpConnection 24 | from .exceptions import * 25 | 26 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/elasticsearch/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/elasticsearch/__init__.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/elasticsearch/client/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/elasticsearch/client/__init__.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/elasticsearch/client/cat.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/elasticsearch/client/cat.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/elasticsearch/client/cluster.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/elasticsearch/client/cluster.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/elasticsearch/client/indices.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/elasticsearch/client/indices.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/elasticsearch/client/nodes.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/elasticsearch/client/nodes.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/elasticsearch/client/snapshot.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/elasticsearch/client/snapshot.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/elasticsearch/client/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import unicode_literals 2 | 3 | import weakref 4 | from datetime import date, datetime 5 | from functools import wraps 6 | from ..compat import string_types, quote_plus 7 | 8 | # parts of URL to be omitted 9 | SKIP_IN_PATH = (None, '', b'', [], ()) 10 | 11 | def _escape(value): 12 | """ 13 | Escape 
a single value of a URL string or a query parameter. If it is a list 14 | or tuple, turn it into a comma-separated string first. 15 | """ 16 | 17 | # make sequences into comma-separated stings 18 | if isinstance(value, (list, tuple)): 19 | value = ','.join(value) 20 | 21 | # dates and datetimes into isoformat 22 | elif isinstance(value, (date, datetime)): 23 | value = value.isoformat() 24 | 25 | # make bools into true/false strings 26 | elif isinstance(value, bool): 27 | value = str(value).lower() 28 | 29 | # encode strings to utf-8 30 | if isinstance(value, string_types): 31 | try: 32 | return value.encode('utf-8') 33 | except UnicodeDecodeError: 34 | # Python 2 and str, no need to re-encode 35 | pass 36 | 37 | return str(value) 38 | 39 | def _make_path(*parts): 40 | """ 41 | Create a URL string from parts, omit all `None` values and empty strings. 42 | Convert lists nad tuples to comma separated values. 43 | """ 44 | #TODO: maybe only allow some parts to be lists/tuples ? 45 | return '/' + '/'.join( 46 | # preserve ',' and '*' in url for nicer URLs in logs 47 | quote_plus(_escape(p), b',*') for p in parts if p not in SKIP_IN_PATH) 48 | 49 | # parameters that apply to all methods 50 | GLOBAL_PARAMS = ('pretty', 'format', 'filter_path') 51 | 52 | def query_params(*es_query_params): 53 | """ 54 | Decorator that pops all accepted parameters from method's kwargs and puts 55 | them in the params argument. 
56 | """ 57 | def _wrapper(func): 58 | @wraps(func) 59 | def _wrapped(*args, **kwargs): 60 | params = kwargs.pop('params', {}) 61 | for p in es_query_params + GLOBAL_PARAMS: 62 | if p in kwargs: 63 | params[p] = _escape(kwargs.pop(p)) 64 | 65 | # don't treat ignore and request_timeout as other params to avoid escaping 66 | for p in ('ignore', 'request_timeout'): 67 | if p in kwargs: 68 | params[p] = kwargs.pop(p) 69 | return func(*args, params=params, **kwargs) 70 | return _wrapped 71 | return _wrapper 72 | 73 | 74 | class NamespacedClient(object): 75 | def __init__(self, client): 76 | self.client = client 77 | 78 | @property 79 | def transport(self): 80 | return self.client.transport 81 | 82 | class AddonClient(NamespacedClient): 83 | @classmethod 84 | def infect_client(cls, client): 85 | addon = cls(weakref.proxy(client)) 86 | setattr(client, cls.namespace, addon) 87 | return client 88 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/elasticsearch/client/utils.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/elasticsearch/client/utils.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/elasticsearch/compat.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | PY2 = sys.version_info[0] == 2 4 | 5 | if PY2: 6 | string_types = basestring, 7 | from urllib import quote_plus, urlencode 8 | from urlparse import urlparse 9 | from itertools import imap as map 10 | else: 11 | string_types = str, bytes 12 | from urllib.parse import quote_plus, urlencode, urlparse 13 | map = map 14 | -------------------------------------------------------------------------------- 
import logging
try:
    import simplejson as json
except ImportError:
    import json

from ..exceptions import TransportError, HTTP_EXCEPTIONS

logger = logging.getLogger('elasticsearch')

# create the elasticsearch.trace logger, but only set propagate to False if the
# logger hasn't already been configured
_tracer_already_configured = 'elasticsearch.trace' in logging.Logger.manager.loggerDict
tracer = logging.getLogger('elasticsearch.trace')
if not _tracer_already_configured:
    tracer.propagate = False


class Connection(object):
    """
    Class responsible for maintaining a connection to an Elasticsearch node. It
    holds persistent connection pool to it and its main interface
    (`perform_request`) is thread-safe.

    Also responsible for logging.
    """
    transport_schema = 'http'

    def __init__(self, host='localhost', port=9200, url_prefix='', timeout=10, **kwargs):
        """
        :arg host: hostname of the node (default: localhost)
        :arg port: port to use (integer, default: 9200)
        :arg url_prefix: optional url prefix for elasticsearch
        :arg timeout: default timeout in seconds (float, default: 10)
        """
        self.host = '%s://%s:%s' % (self.transport_schema, host, port)
        if url_prefix:
            # normalize to exactly one leading slash and no trailing slash
            url_prefix = '/' + url_prefix.strip('/')
        self.url_prefix = url_prefix
        self.timeout = timeout

    def __repr__(self):
        return '<%s: %s>' % (self.__class__.__name__, self.host)

    def log_request_success(self, method, full_url, path, body, status_code, response, duration):
        """ Log a successful API call. """
        # TODO: optionally pass in params instead of full_url and do urlencode only when needed
        def _pretty_json(data):
            # pretty JSON in tracer curl logs
            try:
                return json.dumps(
                    json.loads(data), sort_keys=True, indent=2, separators=(',', ': ')
                ).replace("'", r'\u0027')
            except (ValueError, TypeError):
                # non-json data or a bulk request
                return data

        # body has already been serialized to utf-8, deserialize it for logging
        # TODO: find a better way to avoid (de)encoding the body back and forth
        if body:
            body = body.decode('utf-8')

        logger.info(
            '%s %s [status:%s request:%.3fs]', method, full_url,
            status_code, duration
        )
        logger.debug('> %s', body)
        logger.debug('< %s', response)

        if tracer.isEnabledFor(logging.INFO):
            # include pretty in trace curls
            path = path.replace('?', '?pretty&', 1) if '?' in path else path + '?pretty'
            if self.url_prefix:
                path = path.replace(self.url_prefix, '', 1)
            tracer.info(
                "curl -X%s 'http://localhost:9200%s' -d '%s'",
                method, path, _pretty_json(body) if body else ''
            )

        if tracer.isEnabledFor(logging.DEBUG):
            tracer.debug(
                '#[%s] (%.3fs)\n#%s', status_code, duration,
                _pretty_json(response).replace('\n', '\n#') if response else ''
            )

    def log_request_fail(self, method, full_url, body, duration, status_code=None, exception=None):
        """ Log an unsuccessful API call. """
        logger.warning(
            '%s %s [status:%s request:%.3fs]', method, full_url,
            status_code or 'N/A', duration, exc_info=exception is not None
        )

        # body has already been serialized to utf-8, deserialize it for logging
        # TODO: find a better way to avoid (de)encoding the body back and forth
        if body:
            body = body.decode('utf-8')

        logger.debug('> %s', body)

    def _raise_error(self, status_code, raw_data):
        """
        Locate appropriate exception and raise it.

        The exception class is looked up in ``HTTP_EXCEPTIONS`` by
        ``status_code`` and falls back to ``TransportError``. When
        ``raw_data`` is a JSON object, its ``error`` entry (or that entry's
        ``type``) is used as the message; otherwise the raw data is used.
        """
        error_message = raw_data
        additional_info = None
        try:
            additional_info = json.loads(raw_data)
            error_message = additional_info.get('error', error_message)
            if isinstance(error_message, dict) and 'type' in error_message:
                error_message = error_message['type']
        except (ValueError, TypeError, AttributeError):
            # Body is not JSON, or is JSON but not an object: keep the raw
            # text. Only parsing problems are swallowed here so that
            # KeyboardInterrupt/SystemExit are not silently discarded.
            pass

        raise HTTP_EXCEPTIONS.get(status_code, TransportError)(status_code, error_message, additional_info)
import time
import warnings
try:
    import requests
    REQUESTS_AVAILABLE = True
except ImportError:
    REQUESTS_AVAILABLE = False

from .base import Connection
from ..exceptions import ConnectionError, ImproperlyConfigured, ConnectionTimeout, SSLError
from ..compat import urlencode, string_types


class RequestsHttpConnection(Connection):
    """
    Connection using the `requests` library.

    :arg http_auth: optional http auth information as either ':' separated
        string or a tuple. Any value will be passed into requests as `auth`.
    :arg use_ssl: use ssl for the connection if `True`
    :arg verify_certs: whether to verify SSL certificates
    :arg ca_certs: optional path to CA bundle. By default standard requests'
        bundle will be used.
    :arg client_cert: path to the file containing the private key and the
        certificate
    """
    def __init__(self, host='localhost', port=9200, http_auth=None,
                 use_ssl=False, verify_certs=False, ca_certs=None, client_cert=None,
                 **kwargs):
        if not REQUESTS_AVAILABLE:
            raise ImproperlyConfigured("Please install requests to use RequestsHttpConnection.")

        super(RequestsHttpConnection, self).__init__(host=host, port=port, **kwargs)
        self.session = requests.session()

        if http_auth is not None:
            # sequences and "user:password" strings both end up as a tuple;
            # any other object is handed to requests untouched
            if isinstance(http_auth, (tuple, list)):
                http_auth = tuple(http_auth)
            elif isinstance(http_auth, string_types):
                http_auth = tuple(http_auth.split(':', 1))
            self.session.auth = http_auth

        scheme_suffix = 's' if use_ssl else ''
        self.base_url = 'http%s://%s:%d%s' % (scheme_suffix, host, port, self.url_prefix)

        self.session.verify = verify_certs
        self.session.cert = client_cert
        if ca_certs:
            if not verify_certs:
                raise ImproperlyConfigured("You cannot pass CA certificates when verify SSL is off.")
            # a CA bundle path replaces the plain boolean
            self.session.verify = ca_certs

        if use_ssl and not verify_certs:
            warnings.warn(
                'Connecting to %s using SSL with verify_certs=False is insecure.' % self.base_url)

    def perform_request(self, method, url, params=None, body=None, timeout=None, ignore=()):
        """
        Send the request through the session and return a
        ``(status_code, headers, raw_data)`` tuple.

        Errors raised by `requests` are logged and re-raised as ``SSLError``,
        ``ConnectionTimeout`` or ``ConnectionError``. A status code outside
        2xx that is not listed in ``ignore`` is logged and raised through
        ``_raise_error``.
        """
        target = self.base_url + url
        if params:
            target = '%s?%s' % (target, urlencode(params))

        started = time.time()
        try:
            response = self.session.request(
                method, target, data=body, timeout=timeout or self.timeout)
            duration = time.time() - started
            raw_data = response.text
        except requests.exceptions.SSLError as e:
            self.log_request_fail(method, target, body, time.time() - started, exception=e)
            raise SSLError('N/A', str(e), e)
        except requests.Timeout as e:
            self.log_request_fail(method, target, body, time.time() - started, exception=e)
            raise ConnectionTimeout('TIMEOUT', str(e), e)
        except requests.ConnectionError as e:
            self.log_request_fail(method, target, body, time.time() - started, exception=e)
            raise ConnectionError('N/A', str(e), e)

        code = response.status_code
        # raise errors based on http status codes, let the client handle those if needed
        outside_2xx = code < 200 or code >= 300
        if outside_2xx and code not in ignore:
            self.log_request_fail(method, target, body, duration, code)
            self._raise_error(code, raw_data)

        self.log_request_success(
            method, target, response.request.path_url, body, code, raw_data, duration)

        return code, response.headers, raw_data
import time
try:
    import simplejson as json
except ImportError:
    import json

from ..exceptions import TransportError, ConnectionError, ImproperlyConfigured
from ..compat import urlencode
from .pooling import PoolingConnection


class MemcachedConnection(PoolingConnection):
    """
    Client using the `pylibmc` python library to communicate with elasticsearch
    using the memcached protocol. Requires plugin in the cluster.

    See https://github.com/elasticsearch/elasticsearch-transport-memcached for more details.
    """
    transport_schema = 'memcached'

    method_map = {
        'PUT': 'set',
        'POST': 'set',
        'DELETE': 'delete',
        'HEAD': 'get',
        'GET': 'get',
    }

    def __init__(self, host='localhost', port=11211, **kwargs):
        try:
            import pylibmc
        except ImportError:
            raise ImproperlyConfigured("You need to install pylibmc to use the MemcachedConnection class.")
        super(MemcachedConnection, self).__init__(host=host, port=port, **kwargs)
        # factory used by PoolingConnection when no pooled client is free
        self._make_connection = lambda: pylibmc.Client(['%s:%s' % (host, port)], behaviors={"tcp_nodelay": True})

    def perform_request(self, method, url, params=None, body=None, timeout=None, ignore=()):
        """
        Execute the request over memcached and return a
        ``(status, headers, response)`` tuple; ``headers`` is always empty.

        Any exception raised while talking to memcached is logged and
        re-raised as ``ConnectionError``. A status outside 2xx that is not
        listed in ``ignore`` is logged and raised through ``_raise_error``.
        """
        mc = self._get_connection()
        url = self.url_prefix + url
        if params:
            url = '%s?%s' % (url, urlencode(params))
        full_url = self.host + url

        mc_method = self.method_map.get(method, 'get')

        start = time.time()
        try:
            status = 200
            if mc_method == 'set':
                # no response from set commands
                response = ''
                # Test the result of ``set`` directly: serializing it with
                # json.dumps first always yields a non-empty (truthy) string,
                # which made this failure branch unreachable.
                if not mc.set(url, body):
                    status = 500
            else:
                # NOTE(review): 'delete' from method_map also ends up here and
                # is sent as a ``get`` - confirm whether that is intended.
                response = mc.get(url)

            duration = time.time() - start
            if response:
                response = response.decode('utf-8')
        except Exception as e:
            self.log_request_fail(method, full_url, body, time.time() - start, exception=e)
            raise ConnectionError('N/A', str(e), e)
        finally:
            # hand the client back to the pool whether or not the call failed
            self._release_connection(mc)

        # try not to load the json every time
        if response and response[0] == '{' and ('"status"' in response or '"error"' in response):
            data = json.loads(response)
            if 'status' in data and isinstance(data['status'], int):
                status = data['status']
            elif 'error' in data:
                raise TransportError('N/A', data['error'])

        if not (200 <= status < 300) and status not in ignore:
            # log the same full URL that the other log calls use
            self.log_request_fail(method, full_url, body, duration, status)
            self._raise_error(status, response)

        self.log_request_success(method, full_url, url, body, status,
                                 response, duration)

        return status, {}, response
14 | """ 15 | def __init__(self, *args, **kwargs): 16 | self._free_connections = queue.Queue() 17 | super(PoolingConnection, self).__init__(*args, **kwargs) 18 | 19 | def _get_connection(self): 20 | try: 21 | return self._free_connections.get_nowait() 22 | except queue.Empty: 23 | return self._make_connection() 24 | 25 | def _release_connection(self, con): 26 | self._free_connections.put(con) 27 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/elasticsearch/connection/pooling.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/elasticsearch/connection/pooling.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/elasticsearch/connection/thrift.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from socket import timeout as SocketTimeout 3 | from socket import error as SocketError 4 | import time 5 | import logging 6 | 7 | try: 8 | from .esthrift import Rest 9 | from .esthrift.ttypes import Method, RestRequest 10 | 11 | from thrift.transport import TTransport, TSocket, TSSLSocket 12 | from thrift.protocol import TBinaryProtocol 13 | from thrift.Thrift import TException 14 | THRIFT_AVAILABLE = True 15 | except ImportError: 16 | THRIFT_AVAILABLE = False 17 | 18 | from ..exceptions import ConnectionError, ImproperlyConfigured, ConnectionTimeout 19 | from .pooling import PoolingConnection 20 | 21 | logger = logging.getLogger('elasticsearch') 22 | 23 | class ThriftConnection(PoolingConnection): 24 | """ 25 | This connection class is deprecated and may be removed in future versions. 26 | 27 | Connection using the `thrift` protocol to communicate with elasticsearch. 
28 | 29 | See https://github.com/elasticsearch/elasticsearch-transport-thrift for additional info. 30 | """ 31 | transport_schema = 'thrift' 32 | 33 | def __init__(self, host='localhost', port=9500, framed_transport=False, use_ssl=False, **kwargs): 34 | """ 35 | :arg framed_transport: use `TTransport.TFramedTransport` instead of 36 | `TTransport.TBufferedTransport` 37 | """ 38 | if not THRIFT_AVAILABLE: 39 | raise ImproperlyConfigured("Thrift is not available.") 40 | 41 | super(ThriftConnection, self).__init__(host=host, port=port, **kwargs) 42 | self._framed_transport = framed_transport 43 | self._tsocket_class = TSocket.TSocket 44 | if use_ssl: 45 | self._tsocket_class = TSSLSocket.TSSLSocket 46 | self._tsocket_args = (host, port) 47 | 48 | def _make_connection(self): 49 | socket = self._tsocket_class(*self._tsocket_args) 50 | socket.setTimeout(self.timeout * 1000.0) 51 | if self._framed_transport: 52 | transport = TTransport.TFramedTransport(socket) 53 | else: 54 | transport = TTransport.TBufferedTransport(socket) 55 | 56 | protocol = TBinaryProtocol.TBinaryProtocolAccelerated(transport) 57 | client = Rest.Client(protocol) 58 | client.transport = transport 59 | transport.open() 60 | return client 61 | 62 | def perform_request(self, method, url, params=None, body=None, timeout=None, ignore=()): 63 | request = RestRequest(method=Method._NAMES_TO_VALUES[method.upper()], uri=url, 64 | parameters=params, body=body) 65 | 66 | start = time.time() 67 | tclient = None 68 | try: 69 | tclient = self._get_connection() 70 | response = tclient.execute(request) 71 | duration = time.time() - start 72 | except SocketTimeout as e: 73 | self.log_request_fail(method, url, body, time.time() - start, exception=e) 74 | raise ConnectionTimeout('TIMEOUT', str(e), e) 75 | except (TException, SocketError) as e: 76 | self.log_request_fail(method, url, body, time.time() - start, exception=e) 77 | if tclient: 78 | try: 79 | # try closing transport socket 80 | tclient.transport.close() 81 | 
except Exception as e: 82 | logger.warning( 83 | 'Exception %s occured when closing a failed thrift connection.', 84 | e, exc_info=True 85 | ) 86 | raise ConnectionError('N/A', str(e), e) 87 | 88 | self._release_connection(tclient) 89 | 90 | if not (200 <= response.status < 300) and response.status not in ignore: 91 | self.log_request_fail(method, url, body, duration, response.status) 92 | self._raise_error(response.status, response.body) 93 | 94 | self.log_request_success(method, url, url, body, response.status, 95 | response.body, duration) 96 | 97 | headers = {} 98 | if response.headers: 99 | headers = dict((k.lower(), v) for k, v in response.headers.items()) 100 | return response.status, headers, response.body or '' 101 | 102 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/elasticsearch/connection/thrift.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/elasticsearch/connection/thrift.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/elasticsearch/connection_pool.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/elasticsearch/connection_pool.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/elasticsearch/exceptions.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | 'ImproperlyConfigured', 'ElasticsearchException', 'SerializationError', 3 | 'TransportError', 'NotFoundError', 'ConflictError', 'RequestError', 'ConnectionError', 4 | 'SSLError', 'ConnectionTimeout' 5 
class ImproperlyConfigured(Exception):
    """
    Exception raised when the config passed to the client is inconsistent or invalid.
    """


class ElasticsearchException(Exception):
    """
    Base class for all exceptions raised by this package's operations (doesn't
    apply to :class:`~elasticsearch.ImproperlyConfigured`).
    """


class SerializationError(ElasticsearchException):
    """
    Data passed in failed to serialize properly in the ``Serializer`` being
    used.
    """


class TransportError(ElasticsearchException):
    """
    Exception raised when ES returns a non-OK (>=400) HTTP status code. Or when
    an actual connection error happens; in that case the ``status_code`` will
    be set to ``'N/A'``.

    Instances are created as ``TransportError(status_code, error[, info])``.
    """
    @property
    def status_code(self):
        """
        The HTTP status code of the response that precipitated the error or
        ``'N/A'`` if not applicable.
        """
        return self.args[0]

    @property
    def error(self):
        """ A string error message. """
        return self.args[1]

    @property
    def info(self):
        """
        Dict of returned error info from ES, where available; ``None`` when
        the exception was created without a third argument.
        """
        # the third argument is optional, don't fail when it was not given
        return self.args[2] if len(self.args) > 2 else None

    def __str__(self):
        cause = ''
        try:
            if self.info:
                cause = ', %r' % self.info['error']['root_cause'][0]['reason']
        except (LookupError, TypeError):
            # LookupError: the expected keys are missing.
            # TypeError: ``info`` or its ``error`` entry cannot be indexed
            # that way (e.g. ``error`` is a plain string).
            pass
        return 'TransportError(%s, %r%s)' % (self.status_code, self.error, cause)


class ConnectionError(TransportError):
    """
    Error raised when there was an exception while talking to ES. Original
    exception from the underlying :class:`~elasticsearch.Connection`
    implementation is available as ``.info.``
    """
    def __str__(self):
        return 'ConnectionError(%s) caused by: %s(%s)' % (
            self.error, self.info.__class__.__name__, self.info)


class SSLError(ConnectionError):
    """ Error raised when encountering SSL errors. """


class ConnectionTimeout(ConnectionError):
    """ A network timeout. Doesn't cause a node retry by default. """
    def __str__(self):
        return 'ConnectionTimeout caused by - %s(%s)' % (
            self.info.__class__.__name__, self.info)


class NotFoundError(TransportError):
    """ Exception representing a 404 status code. """


class ConflictError(TransportError):
    """ Exception representing a 409 status code. """


class RequestError(TransportError):
    """ Exception representing a 400 status code. """


class AuthenticationException(TransportError):
    """ Exception representing a 401 status code. """


class AuthorizationException(TransportError):
    """ Exception representing a 403 status code. """


# more generic mappings from status_code to python exceptions
HTTP_EXCEPTIONS = {
    400: RequestError,
    401: AuthenticationException,
    403: AuthorizationException,
    404: NotFoundError,
    409: ConflictError,
}
/lambda_functions/pdf_text_extract/elasticsearch/helpers/test.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | try: 4 | # python 2.6 5 | from unittest2 import TestCase, SkipTest 6 | except ImportError: 7 | from unittest import TestCase, SkipTest 8 | 9 | from elasticsearch import Elasticsearch 10 | from elasticsearch.exceptions import ConnectionError 11 | 12 | def get_test_client(nowait=False, **kwargs): 13 | # construct kwargs from the environment 14 | kw = {'timeout': 30} 15 | if 'TEST_ES_CONNECTION' in os.environ: 16 | from elasticsearch import connection 17 | kw['connection_class'] = getattr(connection, os.environ['TEST_ES_CONNECTION']) 18 | 19 | kw.update(kwargs) 20 | client = Elasticsearch([os.environ.get('TEST_ES_SERVER', {})], **kw) 21 | 22 | # wait for yellow status 23 | for _ in range(1 if nowait else 100): 24 | try: 25 | client.cluster.health(wait_for_status='yellow') 26 | return client 27 | except ConnectionError: 28 | time.sleep(.1) 29 | else: 30 | # timeout 31 | raise SkipTest("Elasticsearch failed to start.") 32 | 33 | def _get_version(version_string): 34 | if '.' 
not in version_string: 35 | return () 36 | version = version_string.strip().split('.') 37 | return tuple(int(v) if v.isdigit() else 999 for v in version) 38 | 39 | class ElasticsearchTestCase(TestCase): 40 | @staticmethod 41 | def _get_client(): 42 | return get_test_client() 43 | 44 | @classmethod 45 | def setUpClass(cls): 46 | super(ElasticsearchTestCase, cls).setUpClass() 47 | cls.client = cls._get_client() 48 | 49 | def tearDown(self): 50 | super(ElasticsearchTestCase, self).tearDown() 51 | self.client.indices.delete(index='*') 52 | self.client.indices.delete_template(name='*', ignore=404) 53 | 54 | @property 55 | def es_version(self): 56 | if not hasattr(self, '_es_version'): 57 | version_string = self.client.info()['version']['number'] 58 | self._es_version = _get_version(version_string) 59 | return self._es_version 60 | 61 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/elasticsearch/helpers/test.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/elasticsearch/helpers/test.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/elasticsearch/serializer.py: -------------------------------------------------------------------------------- 1 | try: 2 | import simplejson as json 3 | except ImportError: 4 | import json 5 | import uuid 6 | from datetime import date, datetime 7 | from decimal import Decimal 8 | 9 | from .exceptions import SerializationError, ImproperlyConfigured 10 | from .compat import string_types 11 | 12 | class TextSerializer(object): 13 | mimetype = 'text/plain' 14 | 15 | def loads(self, s): 16 | return s 17 | 18 | def dumps(self, data): 19 | if isinstance(data, string_types): 20 | return data 21 | 22 | raise SerializationError('Cannot serialize 
%r into text.' % data) 23 | 24 | class JSONSerializer(object): 25 | mimetype = 'application/json' 26 | 27 | def default(self, data): 28 | if isinstance(data, (date, datetime)): 29 | return data.isoformat() 30 | elif isinstance(data, Decimal): 31 | return float(data) 32 | elif isinstance(data, uuid.UUID): 33 | return str(data) 34 | raise TypeError("Unable to serialize %r (type: %s)" % (data, type(data))) 35 | 36 | def loads(self, s): 37 | try: 38 | return json.loads(s) 39 | except (ValueError, TypeError) as e: 40 | raise SerializationError(s, e) 41 | 42 | def dumps(self, data): 43 | # don't serialize strings 44 | if isinstance(data, string_types): 45 | return data 46 | 47 | try: 48 | return json.dumps(data, default=self.default, ensure_ascii=False) 49 | except (ValueError, TypeError) as e: 50 | raise SerializationError(data, e) 51 | 52 | DEFAULT_SERIALIZERS = { 53 | JSONSerializer.mimetype: JSONSerializer(), 54 | TextSerializer.mimetype: TextSerializer(), 55 | } 56 | 57 | class Deserializer(object): 58 | def __init__(self, serializers, default_mimetype='application/json'): 59 | try: 60 | self.default = serializers[default_mimetype] 61 | except KeyError: 62 | raise ImproperlyConfigured('Cannot find default serializer (%s)' % default_mimetype) 63 | self.serializers = serializers 64 | 65 | def loads(self, s, mimetype=None): 66 | if not mimetype: 67 | deserializer = self.default 68 | else: 69 | # split out charset 70 | mimetype = mimetype.split(';', 1)[0] 71 | try: 72 | deserializer = self.serializers[mimetype] 73 | except KeyError: 74 | raise SerializationError('Unknown mimetype, unable to deserialize: %s' % mimetype) 75 | 76 | return deserializer.loads(s) 77 | 78 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/elasticsearch/serializer.pyc: -------------------------------------------------------------------------------- 
from __future__ import print_function

import json
import urllib
import boto3
import slate  # using a specific version of PDFminer due to incompatibilities of certain versions
import elasticsearch
import datetime

es_endpoint = 'search-mattsona-pdf-repo-2vzllafnl4d5oeu647oyu6yy6i.us-west-2.es.amazonaws.com'
es_index = 'pdf_text_extracts'
es_type = 'document'

print('Loading function')

s3 = boto3.client('s3')

# prepare a dict to hold our document data
# NOTE: this is evaluated once at import time. The handler copies it and
# stamps its own insert_time instead of mutating it, because module-level
# state is shared by every call made in the same process.
doc_data = {}
doc_data['insert_time'] = str(datetime.datetime.isoformat(datetime.datetime.now()))


def lambda_handler(event, context):
    """Index the first page of a newly uploaded PDF into Elasticsearch.

    Reads the bucket name and object key from the first entry of
    ``event['Records']``, downloads the object to ``/tmp/tempfile.pdf``,
    extracts its text with ``slate`` and posts a document containing
    ``insert_time``, ``source_pdf_name`` and ``document_text`` (page 1 only)
    to the ``es_index`` / ``es_type`` configured above.

    :arg event: S3 notification payload; only ``Records[0]`` is processed.
    :arg context: Lambda context object (unused).
    :returns: the string ``"Done"`` on success.
    :raises Exception: any download, parsing or indexing error is logged and
        re-raised unchanged.
    """
    #print("Received event: " + json.dumps(event, indent=2))

    # Get the object from the event and show its content type
    s3_record = event['Records'][0]['s3']
    bucket = s3_record['bucket']['name']
    object_key = urllib.unquote_plus(s3_record['object']['key']).decode('utf8')
    temp_pdf_path = '/tmp/tempfile.pdf'

    # Build a fresh document per invocation so nothing leaks between calls.
    document = dict(doc_data)
    document['insert_time'] = datetime.datetime.now().isoformat()

    try:
        # get the file data from s3
        response = s3.get_object(Bucket=bucket, Key=object_key)
        print("CONTENT TYPE: " + response['ContentType'])

        # PDFs are binary: write and read in binary mode, and let the context
        # managers close the handle even when an error is raised.
        with open(temp_pdf_path, 'wb') as temp_pdf_file:
            temp_pdf_file.write(response['Body'].read())

        # pull the text from the temporary PDF file using slate
        print("Extracting data from: " + object_key)
        with open(temp_pdf_path, 'rb') as temp_pdf_file:
            doc = slate.PDF(temp_pdf_file)

        # store document data to dict
        document['source_pdf_name'] = object_key
        document['document_text'] = doc[0]  # we're only worried about page 1 at this point

    except Exception as e:
        print(e)
        print('Error getting object {} from bucket {}. Make sure they exist and your bucket is in the same region as this function.'.format(object_key, bucket))
        raise  # bare raise keeps the original traceback

    # put the data in ES
    try:
        es = elasticsearch.Elasticsearch([{'host': es_endpoint, 'port': 443, 'use_ssl': True}])  # hold off on validating certs
        es_response = es.index(index=es_index, doc_type=es_type, body=document)
        print('Data posted to ES: ' + str(es_response))

    except Exception as e:
        print('Data post to ES failed: ' + str(e))
        raise

    return "Done"
http://www.unixuser.org/~euske/python/pdfminer/index.html 6 | Author: Yusuke Shinyama 7 | Author-email: yusuke at cs dot nyu dot edu 8 | License: MIT/X 9 | Description: PDFMiner is a tool for extracting information from PDF documents. 10 | Unlike other PDF-related tools, it focuses entirely on getting 11 | and analyzing text data. PDFMiner allows to obtain 12 | the exact location of texts in a page, as well as 13 | other information such as fonts or lines. 14 | It includes a PDF converter that can transform PDF files 15 | into other text formats (such as HTML). It has an extensible 16 | PDF parser that can be used for other purposes instead of text analysis. 17 | Keywords: pdf parser,pdf converter,layout analysis,text mining 18 | Platform: UNKNOWN 19 | Classifier: Development Status :: 4 - Beta 20 | Classifier: Environment :: Console 21 | Classifier: Intended Audience :: Developers 22 | Classifier: Intended Audience :: Science/Research 23 | Classifier: License :: OSI Approved :: MIT License 24 | Classifier: Topic :: Text Processing 25 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/pdfminer-20110515-py2.7.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | README.txt 2 | pdfminer/__init__.py 3 | pdfminer/arcfour.py 4 | pdfminer/ascii85.py 5 | pdfminer/cmapdb.py 6 | pdfminer/converter.py 7 | pdfminer/encodingdb.py 8 | pdfminer/fontmetrics.py 9 | pdfminer/glyphlist.py 10 | pdfminer/latin_enc.py 11 | pdfminer/layout.py 12 | pdfminer/lzw.py 13 | pdfminer/pdfcolor.py 14 | pdfminer/pdfdevice.py 15 | pdfminer/pdffont.py 16 | pdfminer/pdfinterp.py 17 | pdfminer/pdfparser.py 18 | pdfminer/pdftypes.py 19 | pdfminer/psparser.py 20 | pdfminer/rijndael.py 21 | pdfminer/runlength.py 22 | pdfminer/utils.py 23 | pdfminer.egg-info/PKG-INFO 24 | pdfminer.egg-info/SOURCES.txt 25 | pdfminer.egg-info/dependency_links.txt 26 | pdfminer.egg-info/top_level.txt 27 | 
pdfminer/cmap/__init__.py 28 | tools/dumppdf.py 29 | tools/latin2ascii.py 30 | tools/pdf2txt.py -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/pdfminer-20110515-py2.7.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/pdfminer-20110515-py2.7.egg-info/installed-files.txt: -------------------------------------------------------------------------------- 1 | ../pdfminer/__init__.py 2 | ../pdfminer/arcfour.py 3 | ../pdfminer/ascii85.py 4 | ../pdfminer/cmapdb.py 5 | ../pdfminer/converter.py 6 | ../pdfminer/encodingdb.py 7 | ../pdfminer/fontmetrics.py 8 | ../pdfminer/glyphlist.py 9 | ../pdfminer/latin_enc.py 10 | ../pdfminer/layout.py 11 | ../pdfminer/lzw.py 12 | ../pdfminer/pdfcolor.py 13 | ../pdfminer/pdfdevice.py 14 | ../pdfminer/pdffont.py 15 | ../pdfminer/pdfinterp.py 16 | ../pdfminer/pdfparser.py 17 | ../pdfminer/pdftypes.py 18 | ../pdfminer/psparser.py 19 | ../pdfminer/rijndael.py 20 | ../pdfminer/runlength.py 21 | ../pdfminer/utils.py 22 | ../pdfminer/cmap/__init__.py 23 | ../pdfminer/__init__.pyc 24 | ../pdfminer/arcfour.pyc 25 | ../pdfminer/ascii85.pyc 26 | ../pdfminer/cmapdb.pyc 27 | ../pdfminer/converter.pyc 28 | ../pdfminer/encodingdb.pyc 29 | ../pdfminer/fontmetrics.pyc 30 | ../pdfminer/glyphlist.pyc 31 | ../pdfminer/latin_enc.pyc 32 | ../pdfminer/layout.pyc 33 | ../pdfminer/lzw.pyc 34 | ../pdfminer/pdfcolor.pyc 35 | ../pdfminer/pdfdevice.pyc 36 | ../pdfminer/pdffont.pyc 37 | ../pdfminer/pdfinterp.pyc 38 | ../pdfminer/pdfparser.pyc 39 | ../pdfminer/pdftypes.pyc 40 | ../pdfminer/psparser.pyc 41 | ../pdfminer/rijndael.pyc 42 | ../pdfminer/runlength.pyc 43 | ../pdfminer/utils.pyc 44 | ../pdfminer/cmap/__init__.pyc 45 | ./ 46 | dependency_links.txt 47 | PKG-INFO 48 | SOURCES.txt 49 | top_level.txt 50 | 
../../../bin/dumppdf.py 51 | ../../../bin/latin2ascii.py 52 | ../../../bin/pdf2txt.py 53 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/pdfminer-20110515-py2.7.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | pdfminer 2 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/pdfminer/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | __version__ = '20110515' 3 | 4 | if __name__ == '__main__': print __version__ 5 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/pdfminer/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/pdfminer/__init__.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/pdfminer/arcfour.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | """ Python implementation of Arcfour encryption algorithm. 4 | 5 | This code is in the public domain. 
## Arcfour
##
class Arcfour(object):

    """Arcfour (RC4) stream cipher.

    The same operation encrypts and decrypts. The cipher state is kept on
    the instance, so successive ``process`` calls continue one keystream.

    >>> import binascii
    >>> binascii.hexlify(Arcfour(b'Key').process(b'Plaintext')) == b'bbf316e8d940af0ad3'
    True
    >>> binascii.hexlify(Arcfour(b'Wiki').process(b'pedia')) == b'1021bf0420'
    True
    >>> binascii.hexlify(Arcfour(b'Secret').process(b'Attack at dawn')) == b'45a01f645fc35b383552544b9bf5'
    True
    """

    def __init__(self, key):
        """Run the key schedule for *key* (a non-empty byte string).

        An empty key raises ``ZeroDivisionError`` from the ``i % klen`` step.
        """
        # bytearray yields ints on both Python 2 and 3, so no ord() is needed.
        key = bytearray(key)
        s = list(range(256))
        j = 0
        klen = len(key)
        for i in range(256):
            j = (j + s[i] + key[i % klen]) % 256
            (s[i], s[j]) = (s[j], s[i])
        self.s = s
        (self.i, self.j) = (0, 0)
        return

    def process(self, data):
        """XOR *data* with the next ``len(data)`` keystream bytes.

        Returns a byte string of the same length and advances the state.
        """
        (i, j) = (self.i, self.j)
        s = self.s
        # Collect into a bytearray: repeated string concatenation is quadratic.
        out = bytearray()
        for c in bytearray(data):
            i = (i+1) % 256
            j = (j+s[i]) % 256
            (s[i], s[j]) = (s[j], s[i])
            k = s[(s[i]+s[j]) % 256]
            out.append(c ^ k)
        (self.i, self.j) = (i, j)
        return bytes(out)

# test
if __name__ == '__main__':
    import doctest
    doctest.testmod()
import re
import struct

# ascii85decode(data)
def ascii85decode(data):
    """
    In ASCII85 encoding, every four bytes are encoded with five ASCII
    letters, using 85 different types of characters (as 256**4 < 85**5).
    When the length of the original bytes is not a multiple of 4, a special
    rule is used for round up.

    The Adobe's ASCII85 implementation is slightly different from
    its original in handling the last characters.

    *data* may be a byte string or text and the result is a byte string.
    Characters outside ``!``..``u``, ``z`` and ``~`` are skipped, and
    decoding stops at the first ``~``.

    The sample string is taken from:
      http://en.wikipedia.org/w/index.php?title=Ascii85

    >>> ascii85decode(b'9jqo^BlbD-BleB1DJ+*+F(f,q') == b'Man is distinguished'
    True
    >>> ascii85decode(b'E,9)oF*2M7/c~>') == b'pleasure.'
    True
    """
    if isinstance(data, type(u'')):
        # Non-ASCII characters can never be ASCII85 digits, so dropping them
        # matches the "skip anything unknown" rule of the loop below.
        data = data.encode('ascii', 'ignore')
    n = b = 0
    # bytearray gives integer items on Python 2 and 3 and amortised O(1) appends.
    out = bytearray()
    for c in bytearray(data):
        if 0x21 <= c <= 0x75:  # '!' .. 'u'
            n += 1
            b = b*85+(c-33)
            if n == 5:
                out += struct.pack('>L', b)
                n = b = 0
        elif c == 0x7a:  # 'z' is shorthand for four zero bytes
            assert n == 0
            out += b'\0\0\0\0'
        elif c == 0x7e:  # '~' starts the end-of-data marker
            if n:
                # Pad the partial group with the highest digit, keep n-1 bytes.
                for _ in range(5-n):
                    b = b*85+84
                out += struct.pack('>L', b)[:n-1]
            break
    return bytes(out)

# asciihexdecode(data)
hex_re = re.compile(r'([a-f\d]{2})', re.IGNORECASE)
trail_re = re.compile(r'^(?:[a-f\d]{2}|\s)*([a-f\d])[\s>]*$', re.IGNORECASE)
def asciihexdecode(data):
    """
    ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1
    For each pair of ASCII hexadecimal digits (0-9 and A-F or a-f), the
    ASCIIHexDecode filter produces one byte of binary data. All white-space
    characters are ignored. A right angle bracket character (>) indicates
    EOD. Any other characters will cause an error. If the filter encounters
    the EOD marker after reading an odd number of hexadecimal digits, it
    will behave as if a 0 followed the last digit.

    *data* may be a byte string or text and the result is a byte string.

    >>> asciihexdecode('61 62 2e6364   65') == b'ab.cde'
    True
    >>> asciihexdecode('61 62 2e6364   657>') == b'ab.cdep'
    True
    >>> asciihexdecode('7>') == b'p'
    True
    """
    if isinstance(data, bytes) and not isinstance(data, str):
        # Python 3 bytes: the patterns above are text patterns.
        data = data.decode('latin-1')
    # Build a real sequence; map() is lazy on Python 3 and has no append().
    out = bytearray(int(hx, 16) for hx in hex_re.findall(data))
    m = trail_re.search(data)
    if m:
        # Odd trailing digit: behave as if a 0 followed it.
        out.append(int(m.group(1) + '0', 16))
    return bytes(out)


if __name__ == '__main__':
    import doctest
    doctest.testmod()
#!/usr/bin/env python2

import re
from psparser import PSLiteral
from glyphlist import glyphname2unicode
from latin_enc import ENCODING


##  name2unicode
##
STRIP_NAME = re.compile(r'[0-9]+')
def name2unicode(name):
    """Converts Adobe glyph names to Unicode numbers.

    Names found in ``glyphname2unicode`` are returned from that table.
    Otherwise the first run of decimal digits in *name* is taken as a code
    point. ``KeyError(name)`` is raised when the name has no digits or when
    the number is not a valid code point.
    """
    if name in glyphname2unicode:
        return glyphname2unicode[name]
    m = STRIP_NAME.search(name)
    if not m: raise KeyError(name)
    try:
        return unichr(int(m.group(0)))
    except (ValueError, OverflowError):
        # unichr() rejects out-of-range numbers. Report them as an unknown
        # glyph name so that get_encoding() skips the entry.
        raise KeyError(name)


##  EncodingDB
##
class EncodingDB(object):
    """Code-to-Unicode tables for the encodings listed in ``encodings``."""

    # Each table maps a character code to its Unicode character and is
    # filled once, at class creation time, from the ENCODING rows.
    std2unicode = {}
    mac2unicode = {}
    win2unicode = {}
    pdf2unicode = {}
    for (name,std,mac,win,pdf) in ENCODING:
        c = name2unicode(name)
        if std: std2unicode[std] = c
        if mac: mac2unicode[mac] = c
        if win: win2unicode[win] = c
        if pdf: pdf2unicode[pdf] = c

    encodings = {
      'StandardEncoding': std2unicode,
      'MacRomanEncoding': mac2unicode,
      'WinAnsiEncoding': win2unicode,
      'PDFDocEncoding': pdf2unicode,
    }

    @classmethod
    def get_encoding(klass, name, diff=None):
        """Return the code-to-Unicode mapping for encoding *name*.

        Unknown names fall back to the standard encoding table. When *diff*
        is given, a copy of the table is patched: an ``int`` item sets the
        current code, and each ``PSLiteral`` item is assigned to the current
        code, which then advances by one. Glyph names that
        ``name2unicode`` cannot resolve are skipped, but still advance the
        code.
        """
        cid2unicode = klass.encodings.get(name, klass.std2unicode)
        if diff:
            # Copy first so the shared class-level table is never modified.
            cid2unicode = cid2unicode.copy()
            cid = 0
            for x in diff:
                if isinstance(x, int):
                    cid = x
                elif isinstance(x, PSLiteral):
                    try:
                        cid2unicode[cid] = name2unicode(x.name)
                    except KeyError:
                        pass
                    cid += 1
        return cid2unicode
#!/usr/bin/env python2
import sys
try:
    from cStringIO import StringIO
except ImportError:
    try:
        from StringIO import StringIO
    except ImportError:
        # Python 3: the decoder reads bytes, so use the bytes buffer.
        from io import BytesIO as StringIO


##  LZWDecoder
##
class LZWDecoder(object):
    """LZW decoder that reads variable-width codes from a byte stream.

    Codes start at 9 bits and grow to 12 as the table fills. Code 256 resets
    the table and code 257 produces no output.
    """

    debug = 0

    def __init__(self, fp):
        """*fp* is a file-like object whose ``read(1)`` returns one byte."""
        self.fp = fp
        self.buff = 0
        self.bpos = 8  # 8 == current byte fully consumed
        self.nbits = 9
        self.table = None
        self.prevbuf = None
        return

    def readbits(self, bits):
        """Return the next *bits* bits of the stream as an integer.

        Raises ``EOFError`` when the stream ends before enough bits are read.
        """
        v = 0
        while 1:
            # the number of remaining bits we can get from the current buffer.
            r = 8-self.bpos
            if bits <= r:
                # |-----8-bits-----|
                # |-bpos-|-bits-|  |
                # |      |----r----|
                v = (v<<bits) | ((self.buff>>(r-bits)) & ((1<<bits)-1))
                self.bpos += bits
                break
            else:
                # |-----8-bits-----|
                # |-bpos-|---bits----...
                # |      |----r----|
                v = (v<<r) | (self.buff & ((1<<r)-1))
                bits -= r
                x = self.fp.read(1)
                if not x: raise EOFError
                self.buff = ord(x)
                self.bpos = 0
        return v

    def feed(self, code):
        """Process one code and return the bytes it decodes to."""
        x = b''
        if code == 256:
            # Clear-table marker: single-byte entries 0-255, then two
            # placeholders for the reserved codes 256 and 257.
            self.table = [ bytes(bytearray((c,))) for c in range(256) ]
            self.table.append(None) # 256
            self.table.append(None) # 257
            self.prevbuf = b''
            self.nbits = 9
        elif code == 257:
            pass
        elif not self.prevbuf:
            x = self.prevbuf = self.table[code]
        else:
            if code < len(self.table):
                x = self.table[code]
                # [:1] rather than [0]: indexing bytes yields an int on Python 3.
                self.table.append(self.prevbuf+x[:1])
            else:
                # The code is not in the table yet, so it must be the entry
                # being defined now: previous output plus its own first byte.
                self.table.append(self.prevbuf+self.prevbuf[:1])
                x = self.table[code]
            l = len(self.table)
            if l == 511:
                self.nbits = 10
            elif l == 1023:
                self.nbits = 11
            elif l == 2047:
                self.nbits = 12
            self.prevbuf = x
        return x

    def run(self):
        """Yield the decoded chunk for each code until the input runs out."""
        while 1:
            try:
                code = self.readbits(self.nbits)
            except EOFError:
                break
            x = self.feed(code)
            yield x
            if self.debug:
                sys.stderr.write('nbits=%d, code=%d, output=%r, table=%r\n' %
                                 (self.nbits, code, x, self.table[258:]))
        return

# lzwdecode
def lzwdecode(data):
    r"""
    Decode the LZW-compressed byte string *data* and return the result.

    >>> lzwdecode(b'\x80\x0b\x60\x50\x22\x0c\x0c\x85\x01') == b'\x2d\x2d\x2d\x2d\x2d\x41\x2d\x2d\x2d\x42'
    True
    """
    fp = StringIO(data)
    return b''.join(LZWDecoder(fp).run())

if __name__ == '__main__':
    import doctest
    doctest.testmod()
#!/usr/bin/env python2
from psparser import LIT


##  PDFColorSpace
##
LITERAL_DEVICE_GRAY = LIT('DeviceGray')
LITERAL_DEVICE_RGB = LIT('DeviceRGB')
LITERAL_DEVICE_CMYK = LIT('DeviceCMYK')

class PDFColorSpace(object):
    """A colour space described by its name and number of components."""

    def __init__(self, name, ncomponents):
        self.name = name
        self.ncomponents = ncomponents
        return

    def __repr__(self):
        # The format string must consume both arguments; an empty string
        # here raises "not all arguments converted" on every repr() call.
        return '<PDFColorSpace: %s, ncomponents=%d>' % (self.name, self.ncomponents)


# Colour spaces known by name, keyed by that name; the number is the
# component count passed to PDFColorSpace.
PREDEFINED_COLORSPACE = dict(
    (name, PDFColorSpace(name,n)) for (name,n) in {
    'CalRGB': 3,
    'CalGray': 1,
    'Lab': 3,
    'DeviceRGB': 3,
    'DeviceCMYK': 4,
    'DeviceGray': 1,
    'Separation': 1,
    'Indexed': 1,
    'Pattern': 1,
    }.iteritems())
-------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/pdfminer/pdfinterp.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/pdfminer/pdfinterp.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/pdfminer/pdfparser.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/pdfminer/pdfparser.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/pdfminer/pdftypes.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/pdfminer/pdftypes.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/pdfminer/psparser.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/pdfminer/psparser.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/pdfminer/rijndael.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/pdfminer/rijndael.pyc -------------------------------------------------------------------------------- 
#!/usr/bin/env python2
#
# RunLength decoder (Adobe version) implementation based on PDF Reference
# version 1.4 section 3.3.4.
#
#  * public domain *
#

import sys

def rldecode(data):
    r"""
    RunLength decoder (Adobe version) implementation based on PDF Reference
    version 1.4 section 3.3.4:
        The RunLengthDecode filter decodes data that has been encoded in a
        simple byte-oriented format based on run length. The encoded data
        is a sequence of runs, where each run consists of a length byte
        followed by 1 to 128 bytes of data. If the length byte is in the
        range 0 to 127, the following length + 1 (1 to 128) bytes are
        copied literally during decompression. If length is in the range
        129 to 255, the following single byte is to be copied 257 - length
        (2 to 128) times during decompression. A length value of 128
        denotes EOD.

    *data* is a byte string and so is the result. Input that ends in the
    middle of a run is decoded as far as it goes instead of raising.

    >>> s = b"\x05123456\xfa7\x04abcde\x80junk"
    >>> rldecode(s) == b'1234567777777abcde'
    True
    """
    # bytearray gives integer items on Python 2 and 3, so no ord() is needed.
    src = bytearray(data)
    size = len(src)
    decoded = bytearray()
    i = 0
    while i < size:
        length = src[i]
        if length == 128:
            break
        if length < 128:
            # Literal run: copy the next length+1 bytes. A short slice at the
            # end of the data simply copies what is there.
            decoded += src[i+1:(i+1)+(length+1)]
            i = (i+1) + (length+1)
        else:
            if i+1 >= size:
                # Repeat count with no byte to repeat: truncated stream.
                break
            decoded += src[i+1:i+2] * (257-length)
            i = (i+1) + 1
    return bytes(decoded)


if __name__ == '__main__':
    import doctest
    doctest.testmod()
/lambda_functions/pdf_text_extract/pkg_resources/_vendor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/pkg_resources/_vendor/__init__.py -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/pkg_resources/_vendor/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/pkg_resources/_vendor/__init__.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/pkg_resources/_vendor/packaging/__about__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Donald Stufft 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | from __future__ import absolute_import, division, print_function 15 | 16 | __all__ = [ 17 | "__title__", "__summary__", "__uri__", "__version__", "__author__", 18 | "__email__", "__license__", "__copyright__", 19 | ] 20 | 21 | __title__ = "packaging" 22 | __summary__ = "Core utilities for Python packages" 23 | __uri__ = "https://github.com/pypa/packaging" 24 | 25 | __version__ = "15.3" 26 | 27 | __author__ = "Donald Stufft" 28 | __email__ = "donald@stufft.io" 29 | 30 | __license__ = "Apache License, Version 2.0" 31 | __copyright__ = "Copyright 2014 %s" % __author__ 32 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/pkg_resources/_vendor/packaging/__about__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/pkg_resources/_vendor/packaging/__about__.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/pkg_resources/_vendor/packaging/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Donald Stufft 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | from __future__ import absolute_import, division, print_function 15 | 16 | from .__about__ import ( 17 | __author__, __copyright__, __email__, __license__, __summary__, __title__, 18 | __uri__, __version__ 19 | ) 20 | 21 | __all__ = [ 22 | "__title__", "__summary__", "__uri__", "__version__", "__author__", 23 | "__email__", "__license__", "__copyright__", 24 | ] 25 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/pkg_resources/_vendor/packaging/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/pkg_resources/_vendor/packaging/__init__.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/pkg_resources/_vendor/packaging/_compat.py: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Donald Stufft 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
def with_metaclass(meta, *bases):
    """
    Return a placeholder base class that applies ``meta`` to its subclasses.

    Intended use is ``class C(with_metaclass(Meta, Base1, Base2)): ...``,
    which produces a class built by ``Meta`` with ``bases`` as its real
    base classes.
    """
    # The placeholder's own metaclass intercepts the creation of the first
    # subclass and hands the work to ``meta`` with the real bases, so the
    # placeholder itself does not end up among the final class's bases.
    class metaclass(meta):
        def __new__(mcls, clsname, placeholder_bases, namespace):
            return meta(clsname, bases, namespace)

    placeholder = type.__new__(metaclass, 'temporary_class', (), {})
    return placeholder
class Infinity(object):
    """Value that compares greater than anything it is compared with."""

    def __repr__(self):
        return "Infinity"

    def __hash__(self):
        return hash(repr(self))

    # Equality holds only against another instance of this class.
    def __eq__(self, other):
        return isinstance(other, self.__class__)

    def __ne__(self, other):
        return not isinstance(other, self.__class__)

    # Ordering answers are constant and ignore ``other`` entirely.
    def __gt__(self, other):
        return True

    def __ge__(self, other):
        return True

    def __lt__(self, other):
        return False

    def __le__(self, other):
        return False

    def __neg__(self):
        # Resolved at call time, by which point the name below is bound to
        # the NegativeInfinity instance.
        return NegativeInfinity

# Rebind the name to a single instance; the class is not used directly.
Infinity = Infinity()


class NegativeInfinity(object):
    """Value that compares less than anything it is compared with."""

    def __repr__(self):
        return "-Infinity"

    def __hash__(self):
        return hash(repr(self))

    # Equality holds only against another instance of this class.
    def __eq__(self, other):
        return isinstance(other, self.__class__)

    def __ne__(self, other):
        return not isinstance(other, self.__class__)

    # Ordering answers are constant and ignore ``other`` entirely.
    def __gt__(self, other):
        return False

    def __ge__(self, other):
        return False

    def __lt__(self, other):
        return True

    def __le__(self, other):
        return True

    def __neg__(self):
        return Infinity

# Rebind the name to a single instance; the class is not used directly.
NegativeInfinity = NegativeInfinity()
/lambda_functions/pdf_text_extract/pkg_resources/_vendor/packaging/specifiers.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/pkg_resources/_vendor/packaging/specifiers.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/pkg_resources/_vendor/packaging/version.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/pkg_resources/_vendor/packaging/version.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/setuptools-19.2.dist-info/WHEEL: -------------------------------------------------------------------------------- 1 | Wheel-Version: 1.0 2 | Generator: bdist_wheel (0.26.0) 3 | Root-Is-Purelib: true 4 | Tag: py2-none-any 5 | Tag: py3-none-any 6 | 7 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/setuptools-19.2.dist-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | https://pypi.python.org/packages/source/c/certifi/certifi-2015.11.20.tar.gz#md5=25134646672c695c1ff1593c2dd75d08 2 | https://pypi.python.org/packages/source/w/wincertstore/wincertstore-0.2.zip#md5=ae728f2f007185648d0c7a8679b361e2 3 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/setuptools-19.2.dist-info/entry_points.txt: -------------------------------------------------------------------------------- 1 | [console_scripts] 2 | easy_install = setuptools.command.easy_install:main 3 | easy_install-3.5 = setuptools.command.easy_install:main 
4 | 5 | [distutils.commands] 6 | alias = setuptools.command.alias:alias 7 | bdist_egg = setuptools.command.bdist_egg:bdist_egg 8 | bdist_rpm = setuptools.command.bdist_rpm:bdist_rpm 9 | bdist_wininst = setuptools.command.bdist_wininst:bdist_wininst 10 | build_ext = setuptools.command.build_ext:build_ext 11 | build_py = setuptools.command.build_py:build_py 12 | develop = setuptools.command.develop:develop 13 | easy_install = setuptools.command.easy_install:easy_install 14 | egg_info = setuptools.command.egg_info:egg_info 15 | install = setuptools.command.install:install 16 | install_egg_info = setuptools.command.install_egg_info:install_egg_info 17 | install_lib = setuptools.command.install_lib:install_lib 18 | install_scripts = setuptools.command.install_scripts:install_scripts 19 | register = setuptools.command.register:register 20 | rotate = setuptools.command.rotate:rotate 21 | saveopts = setuptools.command.saveopts:saveopts 22 | sdist = setuptools.command.sdist:sdist 23 | setopt = setuptools.command.setopt:setopt 24 | test = setuptools.command.test:test 25 | upload_docs = setuptools.command.upload_docs:upload_docs 26 | 27 | [distutils.setup_keywords] 28 | convert_2to3_doctests = setuptools.dist:assert_string_list 29 | dependency_links = setuptools.dist:assert_string_list 30 | eager_resources = setuptools.dist:assert_string_list 31 | entry_points = setuptools.dist:check_entry_points 32 | exclude_package_data = setuptools.dist:check_package_data 33 | extras_require = setuptools.dist:check_extras 34 | include_package_data = setuptools.dist:assert_bool 35 | install_requires = setuptools.dist:check_requirements 36 | namespace_packages = setuptools.dist:check_nsp 37 | package_data = setuptools.dist:check_package_data 38 | packages = setuptools.dist:check_packages 39 | setup_requires = setuptools.dist:check_requirements 40 | test_loader = setuptools.dist:check_importable 41 | test_runner = setuptools.dist:check_importable 42 | test_suite = 
setuptools.dist:check_test_suite 43 | tests_require = setuptools.dist:check_requirements 44 | use_2to3 = setuptools.dist:assert_bool 45 | use_2to3_exclude_fixers = setuptools.dist:assert_string_list 46 | use_2to3_fixers = setuptools.dist:assert_string_list 47 | zip_safe = setuptools.dist:assert_bool 48 | 49 | [egg_info.writers] 50 | PKG-INFO = setuptools.command.egg_info:write_pkg_info 51 | dependency_links.txt = setuptools.command.egg_info:overwrite_arg 52 | depends.txt = setuptools.command.egg_info:warn_depends_obsolete 53 | eager_resources.txt = setuptools.command.egg_info:overwrite_arg 54 | entry_points.txt = setuptools.command.egg_info:write_entries 55 | namespace_packages.txt = setuptools.command.egg_info:overwrite_arg 56 | requires.txt = setuptools.command.egg_info:write_requirements 57 | top_level.txt = setuptools.command.egg_info:write_toplevel_names 58 | 59 | [setuptools.installation] 60 | eggsecutable = setuptools.command.easy_install:bootstrap 61 | 62 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/setuptools-19.2.dist-info/metadata.json: -------------------------------------------------------------------------------- 1 | {"generator": "bdist_wheel (0.26.0)", "summary": "Easily download, build, install, upgrade, and uninstall Python packages", "classifiers": ["Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", "License :: OSI Approved :: Python Software Foundation License", "License :: OSI Approved :: Zope Public License", "Operating System :: OS Independent", "Programming Language :: Python :: 2.6", "Programming Language :: Python :: 2.7", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.3", "Programming Language :: Python :: 3.4", "Programming Language :: Python :: 3.5", "Topic :: Software Development :: Libraries :: Python Modules", "Topic :: System :: Archiving :: Packaging", "Topic :: System :: Systems Administration", 
"Topic :: Utilities"], "extensions": {"python.details": {"project_urls": {"Home": "https://bitbucket.org/pypa/setuptools"}, "contacts": [{"email": "distutils-sig@python.org", "name": "Python Packaging Authority", "role": "author"}], "document_names": {"description": "DESCRIPTION.rst"}}, "python.exports": {"console_scripts": {"easy_install": "setuptools.command.easy_install:main", "easy_install-3.5": "setuptools.command.easy_install:main"}, "distutils.commands": {"alias": "setuptools.command.alias:alias", "bdist_egg": "setuptools.command.bdist_egg:bdist_egg", "bdist_rpm": "setuptools.command.bdist_rpm:bdist_rpm", "bdist_wininst": "setuptools.command.bdist_wininst:bdist_wininst", "build_ext": "setuptools.command.build_ext:build_ext", "build_py": "setuptools.command.build_py:build_py", "develop": "setuptools.command.develop:develop", "easy_install": "setuptools.command.easy_install:easy_install", "egg_info": "setuptools.command.egg_info:egg_info", "install": "setuptools.command.install:install", "install_egg_info": "setuptools.command.install_egg_info:install_egg_info", "install_lib": "setuptools.command.install_lib:install_lib", "install_scripts": "setuptools.command.install_scripts:install_scripts", "register": "setuptools.command.register:register", "rotate": "setuptools.command.rotate:rotate", "saveopts": "setuptools.command.saveopts:saveopts", "sdist": "setuptools.command.sdist:sdist", "setopt": "setuptools.command.setopt:setopt", "test": "setuptools.command.test:test", "upload_docs": "setuptools.command.upload_docs:upload_docs"}, "distutils.setup_keywords": {"convert_2to3_doctests": "setuptools.dist:assert_string_list", "dependency_links": "setuptools.dist:assert_string_list", "eager_resources": "setuptools.dist:assert_string_list", "entry_points": "setuptools.dist:check_entry_points", "exclude_package_data": "setuptools.dist:check_package_data", "extras_require": "setuptools.dist:check_extras", "include_package_data": "setuptools.dist:assert_bool", 
"install_requires": "setuptools.dist:check_requirements", "namespace_packages": "setuptools.dist:check_nsp", "package_data": "setuptools.dist:check_package_data", "packages": "setuptools.dist:check_packages", "setup_requires": "setuptools.dist:check_requirements", "test_loader": "setuptools.dist:check_importable", "test_runner": "setuptools.dist:check_importable", "test_suite": "setuptools.dist:check_test_suite", "tests_require": "setuptools.dist:check_requirements", "use_2to3": "setuptools.dist:assert_bool", "use_2to3_exclude_fixers": "setuptools.dist:assert_string_list", "use_2to3_fixers": "setuptools.dist:assert_string_list", "zip_safe": "setuptools.dist:assert_bool"}, "egg_info.writers": {"PKG-INFO": "setuptools.command.egg_info:write_pkg_info", "dependency_links.txt": "setuptools.command.egg_info:overwrite_arg", "depends.txt": "setuptools.command.egg_info:warn_depends_obsolete", "eager_resources.txt": "setuptools.command.egg_info:overwrite_arg", "entry_points.txt": "setuptools.command.egg_info:write_entries", "namespace_packages.txt": "setuptools.command.egg_info:overwrite_arg", "requires.txt": "setuptools.command.egg_info:write_requirements", "top_level.txt": "setuptools.command.egg_info:write_toplevel_names"}, "setuptools.installation": {"eggsecutable": "setuptools.command.easy_install:bootstrap"}}, "python.commands": {"wrap_console": {"easy_install": "setuptools.command.easy_install:main", "easy_install-3.5": "setuptools.command.easy_install:main"}}}, "keywords": ["CPAN", "PyPI", "distutils", "eggs", "package", "management"], "license": "PSF or ZPL", "metadata_version": "2.0", "name": "setuptools", "extras": ["certs", "ssl"], "run_requires": [{"requires": ["certifi (==2015.11.20)"], "extra": "certs"}, {"requires": ["wincertstore (==0.2)"], "extra": "ssl", "environment": "sys_platform=='win32'"}], "version": "19.2", "test_requires": [{"requires": ["pytest (>=2.8)", "setuptools[ssl]"]}]} 
-------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/setuptools-19.2.dist-info/top_level.txt: -------------------------------------------------------------------------------- 1 | _markerlib 2 | easy_install 3 | pkg_resources 4 | setuptools 5 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/setuptools-19.2.dist-info/zip-safe: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/setuptools/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/setuptools/__init__.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/setuptools/archive_util.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/setuptools/archive_util.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/setuptools/cli-32.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/setuptools/cli-32.exe -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/setuptools/cli-64.exe: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/setuptools/cli-64.exe -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/setuptools/cli-arm-32.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/setuptools/cli-arm-32.exe -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/setuptools/cli.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/setuptools/cli.exe -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/setuptools/command/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | 'alias', 'bdist_egg', 'bdist_rpm', 'build_ext', 'build_py', 'develop', 3 | 'easy_install', 'egg_info', 'install', 'install_lib', 'rotate', 'saveopts', 4 | 'sdist', 'setopt', 'test', 'install_egg_info', 'install_scripts', 5 | 'register', 'bdist_wininst', 'upload_docs', 6 | ] 7 | 8 | from distutils.command.bdist import bdist 9 | import sys 10 | 11 | from setuptools.command import install_scripts 12 | 13 | 14 | if 'egg' not in bdist.format_commands: 15 | bdist.format_command['egg'] = ('bdist_egg', "Python .egg file") 16 | bdist.format_commands.append('egg') 17 | 18 | del bdist, sys 19 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/setuptools/command/__init__.pyc: -------------------------------------------------------------------------------- 
def shquote(arg):
    """Quote an argument for later parsing by shlex.split()"""
    # Quote when the argument holds a quote, backslash or comment character,
    # or when whitespace splitting would not give it back as one token.
    has_special = any(ch in arg for ch in ('"', "'", "\\", "#"))
    if has_special or arg.split() != [arg]:
        return repr(arg)
    return arg


class alias(option_base):
    """Define a shortcut that invokes one or more commands"""

    description = "define a shortcut to invoke one or more commands"
    command_consumes_arguments = True

    user_options = [
        ('remove', 'r', 'remove (unset) the alias'),
    ] + option_base.user_options

    boolean_options = option_base.boolean_options + ['remove']

    def initialize_options(self):
        option_base.initialize_options(self)
        self.args = None
        self.remove = None

    def finalize_options(self):
        option_base.finalize_options(self)
        if not self.remove:
            return
        if len(self.args) != 1:
            raise DistutilsOptionError(
                "Must specify exactly one argument (the alias name) when "
                "using --remove"
            )

    def run(self):
        """List, show, remove or define aliases depending on the arguments."""
        aliases = self.distribution.get_option_dict('aliases')
        args = self.args

        # No arguments: list every known alias.
        if not args:
            print("Command Aliases")
            print("---------------")
            for name in aliases:
                print("setup.py alias", format_alias(name, aliases))
            return

        if len(args) == 1:
            name, = args
            if not self.remove:
                # A lone name without --remove only displays the alias.
                if name in aliases:
                    print("setup.py alias", format_alias(name, aliases))
                else:
                    print("No alias definition found for %r" % name)
                return
            # --remove: a None command is what gets written for the alias.
            command = None
        else:
            # Name followed by the command words that make up the alias.
            name = args[0]
            command = ' '.join(shquote(word) for word in args[1:])

        edit_config(self.filename, {'aliases': {name: command}}, self.dry_run)


def format_alias(name, aliases):
    """Return the ``alias`` argument string that would recreate ``name``."""
    source, command = aliases[name]
    # Turn the config file the alias came from into the matching option.
    if source == config_file('global'):
        prefix = '--global-config '
    elif source == config_file('user'):
        prefix = '--user-config '
    elif source == config_file('local'):
        prefix = ''
    else:
        prefix = '--filename=%r' % source
    return prefix + name + ' ' + command
class bdist_rpm(orig.bdist_rpm):
    """
    Override the default bdist_rpm behavior to do the following:

    1. Run egg_info to ensure the name and version are properly calculated.
    2. Always run 'install' using --single-version-externally-managed to
       disable eggs in RPM distributions.
    3. Replace dash with underscore in the version numbers for better RPM
       compatibility.
    """

    def run(self):
        # ensure distro name is up-to-date
        self.run_command('egg_info')

        orig.bdist_rpm.run(self)

    def _make_spec_file(self):
        """Return the base class's spec lines with the tweaks listed above."""
        version = self.distribution.get_version()
        rpmversion = version.replace('-', '_')
        base_spec = orig.bdist_rpm._make_spec_file(self)
        line23 = '%define version ' + version
        line24 = '%define version ' + rpmversion

        # Applied to every line, in this order.
        substitutions = (
            (
                "Source0: %{name}-%{version}.tar",
                "Source0: %{name}-%{unmangled_version}.tar",
            ),
            (
                "setup.py install ",
                "setup.py install --single-version-externally-managed ",
            ),
            (
                "%setup",
                "%setup -n %{name}-%{unmangled_version}",
            ),
            (line23, line24),
        )
        spec = []
        for line in base_spec:
            for old, new in substitutions:
                line = line.replace(old, new)
            spec.append(line)

        # Define the original version right after the RPM-safe one, since
        # the rewritten Source0 and %setup lines refer to it.
        insert_loc = spec.index(line24) + 1
        spec.insert(insert_loc, "%define unmangled_version " + version)
        return spec
class bdist_wininst(orig.bdist_wininst):
    """bdist_wininst variant that resets install_lib and tracks running state."""

    def reinitialize_command(self, command, reinit_subcommands=0):
        """
        Supplement reinitialize_command to work around
        http://bugs.python.org/issue20819
        """
        cmd = self.distribution.reinitialize_command(
            command, reinit_subcommands)
        # Clear install_lib on the install commands (see the issue above).
        if command == 'install' or command == 'install_lib':
            cmd.install_lib = None
        return cmd

    def run(self):
        # The flag is True only while the base run() is executing and is
        # cleared again even when that call raises.
        self._is_running = True
        try:
            orig.bdist_wininst.run(self)
        finally:
            self._is_running = False
class install_egg_info(Command):
    """Install an .egg-info directory for the package"""

    description = "Install an .egg-info directory for the package"

    user_options = [
        ('install-dir=', 'd', "directory to install to"),
    ]

    # Statements that, joined with ';', form the single line written to the
    # '-nspkg.pth' file for one namespace package.
    _nspkg_tmpl = (
        "import sys, types, os",
        "p = os.path.join(sys._getframe(1).f_locals['sitedir'], *%(pth)r)",
        "ie = os.path.exists(os.path.join(p,'__init__.py'))",
        "m = not ie and "
        "sys.modules.setdefault(%(pkg)r, types.ModuleType(%(pkg)r))",
        "mp = (m or []) and m.__dict__.setdefault('__path__',[])",
        "(p not in mp) and mp.append(p)",
    )

    # Extra statement(s) appended when the namespace has a parent package.
    _nspkg_tmpl_multi = (
        'm and setattr(sys.modules[%(parent)r], %(child)r, m)',
    )

    def initialize_options(self):
        """Declare the option attribute with no value yet."""
        self.install_dir = None

    def finalize_options(self):
        """Derive source, target and initial outputs from other commands."""
        self.set_undefined_options(
            'install_lib', ('install_dir', 'install_dir'))
        egg_info_cmd = self.get_finalized_command("egg_info")
        dist = pkg_resources.Distribution(
            None, None, egg_info_cmd.egg_name, egg_info_cmd.egg_version)
        info_dir_name = dist.egg_name() + '.egg-info'
        self.source = egg_info_cmd.egg_info
        self.target = os.path.join(self.install_dir, info_dir_name)
        self.outputs = [self.target]

    def run(self):
        """Replace any existing target, copy the tree, install namespaces."""
        self.run_command('egg_info')
        target = self.target
        is_real_dir = os.path.isdir(target) and not os.path.islink(target)
        if is_real_dir:
            dir_util.remove_tree(target, dry_run=self.dry_run)
        elif os.path.exists(target):
            self.execute(os.unlink, (target,), "Removing " + target)
        if not self.dry_run:
            pkg_resources.ensure_directory(target)
        message = "Copying %s to %s" % (self.source, target)
        self.execute(self.copytree, (), message)
        self.install_namespaces()

    def get_outputs(self):
        """Return the list of paths recorded as outputs so far."""
        return self.outputs

    def copytree(self):
        """Copy the .egg-info tree to the target, skipping VCS directories."""
        vcs_dirs = ('.svn/', 'CVS/')

        def skimmer(src, dst):
            # 'src' is always a '/'-separated path, regardless of platform;
            # 'dst' is a platform-specific path.
            under_vcs = any(
                src.startswith(d) or ('/' + d) in src for d in vcs_dirs)
            if under_vcs:
                return None
            self.outputs.append(dst)
            log.debug("Copying %s to %s", src, dst)
            return dst

        unpack_archive(self.source, self.target, skimmer)

    def install_namespaces(self):
        """Write the '-nspkg.pth' file for the declared namespace packages."""
        namespaces = self._get_all_ns_packages()
        if not namespaces:
            return
        pth_file = os.path.splitext(self.target)[0] + '-nspkg.pth'
        self.outputs.append(pth_file)
        log.info("Installing %s", pth_file)
        lines = map(self._gen_nspkg_line, namespaces)

        if not self.dry_run:
            with open(pth_file, 'wt') as stream:
                stream.writelines(lines)
            return

        # dry run: still render every line, just do not write the file
        list(lines)

    @classmethod
    def _gen_nspkg_line(cls, pkg):
        """Render the .pth line for the namespace package named *pkg*."""
        # ensure pkg is not a unicode string under Python 2.7
        pkg = str(pkg)
        parent, sep, child = pkg.rpartition('.')
        statements = cls._nspkg_tmpl
        if parent:
            statements = statements + cls._nspkg_tmpl_multi
        values = {
            'pkg': pkg,
            'pth': tuple(pkg.split('.')),
            'parent': parent,
            'child': child,
        }
        return ';'.join(statements) % values + '\n'

    def _get_all_ns_packages(self):
        """Return sorted list of all package namespaces"""
        found = set()
        for name in self.distribution.namespace_packages or []:
            parts = name.split('.')
            # every dotted prefix of the name is itself a namespace
            for depth in range(len(parts), 0, -1):
                found.add('.'.join(parts[:depth]))
        return sorted(found)
class install_lib(orig.install_lib):
    """Don't add compiled flags to filenames of non-Python files"""

    def run(self):
        """Build, install, and byte-compile whatever was installed."""
        self.build()
        installed = self.install()
        if installed is None:
            return
        # always compile, in case we have any extension stubs to deal with
        self.byte_compile(installed)

    def get_exclusions(self):
        """
        Return the set of full paths to be excluded for
        single_version_externally_managed installations.
        """
        packages = [
            pkg
            for namespace in self._get_SVEM_NSPs()
            for pkg in self._all_packages(namespace)
        ]
        rel_paths = list(self._gen_exclusion_paths())

        excluded = set()
        for pkg in packages:
            for rel_path in rel_paths:
                excluded.add(self._exclude_pkg_path(pkg, rel_path))
        return excluded

    def _exclude_pkg_path(self, pkg, exclusion_path):
        """
        Join the install dir, the components of dotted name *pkg*, and
        *exclusion_path* into one full path.
        """
        components = pkg.split('.')
        components.append(exclusion_path)
        return os.path.join(self.install_dir, *components)

    @staticmethod
    def _all_packages(pkg_name):
        """
        >>> list(install_lib._all_packages('foo.bar.baz'))
        ['foo.bar.baz', 'foo.bar', 'foo']
        """
        remaining = pkg_name
        while remaining:
            yield remaining
            # drop the last dotted component
            remaining = remaining.rpartition('.')[0]

    def _get_SVEM_NSPs(self):
        """
        Get namespace packages (list) but only for
        single_version_externally_managed installations and empty otherwise.
        """
        # TODO: is it necessary to short-circuit here? i.e. what's the cost
        # if get_finalized_command is called even when namespace_packages is
        # False?
        if not self.distribution.namespace_packages:
            return []

        install_cmd = self.get_finalized_command('install')
        if install_cmd.single_version_externally_managed:
            return self.distribution.namespace_packages
        return []

    @staticmethod
    def _gen_exclusion_paths():
        """
        Generate file paths to be excluded for namespace packages (bytecode
        cache files).
        """
        # the package module itself, then its legacy bytecode files
        for name in ('__init__.py', '__init__.pyc', '__init__.pyo'):
            yield name

        if hasattr(imp, 'get_tag'):
            stem = os.path.join('__pycache__', '__init__.' + imp.get_tag())
            for suffix in ('.pyc', '.pyo', '.opt-1.pyc', '.opt-2.pyc'):
                yield stem + suffix

    def copy_tree(
            self, infile, outfile,
            preserve_mode=1, preserve_times=1, preserve_symlinks=0, level=1
    ):
        """Copy *infile* to *outfile*, leaving out excluded paths."""
        assert preserve_mode and preserve_times and not preserve_symlinks
        excluded = self.get_exclusions()

        if not excluded:
            return orig.install_lib.copy_tree(self, infile, outfile)

        # Exclude namespace package __init__.py* files from the output

        from setuptools.archive_util import unpack_directory
        from distutils import log

        copied = []

        def progress_filter(src, dst):
            if dst not in excluded:
                log.info("copying %s -> %s", src, os.path.dirname(dst))
                copied.append(dst)
                return dst
            log.warn("Skipping installation of %s (namespace package)",
                     dst)
            return False

        unpack_directory(infile, outfile, progress_filter)
        return copied

    def get_outputs(self):
        """Return the base class outputs minus any excluded paths."""
        outputs = orig.install_lib.get_outputs(self)
        excluded = self.get_exclusions()
        if not excluded:
            return outputs
        return [path for path in outputs if path not in excluded]
class install_scripts(orig.install_scripts):
    """Do normal script install, plus any egg_info wrapper scripts"""

    def initialize_options(self):
        """Initialize the base options and default ``no_ep`` to False."""
        orig.install_scripts.initialize_options(self)
        self.no_ep = False

    def run(self):
        """
        Install the distribution's scripts, then write one script per
        argument tuple produced by the easy_install script writer.

        Nothing beyond the base install is done when ``self.no_ep`` is set.
        """
        import setuptools.command.easy_install as ei

        self.run_command("egg_info")
        if self.distribution.scripts:
            orig.install_scripts.run(self)  # run first to set up self.outfiles
        else:
            self.outfiles = []
        if self.no_ep:
            # don't install entry point scripts into .egg file!
            return

        ei_cmd = self.get_finalized_command("egg_info")
        dist = Distribution(
            ei_cmd.egg_base, PathMetadata(ei_cmd.egg_base, ei_cmd.egg_info),
            ei_cmd.egg_name, ei_cmd.egg_version,
        )
        bs_cmd = self.get_finalized_command('build_scripts')
        exec_param = getattr(bs_cmd, 'executable', None)
        bw_cmd = self.get_finalized_command("bdist_wininst")
        is_wininst = getattr(bw_cmd, '_is_running', False)
        writer = ei.ScriptWriter
        if is_wininst:
            # a running bdist_wininst forces the Windows writer and launcher
            exec_param = "python.exe"
            writer = ei.WindowsScriptWriter
        # resolve the writer to the environment
        writer = writer.best()
        cmd = writer.command_spec_class.best().from_param(exec_param)
        for args in writer.get_args(dist, cmd.as_header()):
            self.write_script(*args)

    def write_script(self, script_name, contents, mode="t", *ignored):
        """
        Write an executable file to the scripts directory.

        The target path is always recorded in ``self.outfiles``; the file is
        only written and chmod'ed when this is not a dry run. *mode* is
        appended to ``"w"`` to form the open mode.
        """
        from setuptools.command.easy_install import chmod, current_umask

        log.info("Installing %s script to %s", script_name, self.install_dir)
        target = os.path.join(self.install_dir, script_name)
        self.outfiles.append(target)

        mask = current_umask()
        if not self.dry_run:
            ensure_directory(target)
            # the context manager closes the handle even if write() raises
            with open(target, "w" + mode) as f:
                f.write(contents)
            chmod(target, 0o777 - mask)
class register(orig.register):
    # Reuse the docstring of the distutils command being wrapped.
    __doc__ = orig.register.__doc__

    def run(self):
        """Run ``egg_info`` first, then the wrapped ``register`` command."""
        # Make sure that we are using valid current name/version info
        self.run_command('egg_info')
        orig.register.run(self)
class rotate(Command):
    """Delete older distributions"""

    description = "delete older distributions, keeping N newest files"
    user_options = [
        ('match=', 'm', "patterns to match (required)"),
        ('dist-dir=', 'd', "directory where the distributions are"),
        ('keep=', 'k', "number of matching distributions to keep"),
    ]

    boolean_options = []

    def initialize_options(self):
        """Declare all option attributes with no value yet."""
        self.match = None
        self.dist_dir = None
        self.keep = None

    def finalize_options(self):
        """Validate 'match' and 'keep', and normalise them for run()."""
        patterns = self.match
        if patterns is None:
            raise DistutilsOptionError(
                "Must specify one or more (comma-separated) match patterns "
                "(e.g. '.zip' or '.egg')"
            )
        if self.keep is None:
            raise DistutilsOptionError("Must specify number of files to keep")
        try:
            self.keep = int(self.keep)
        except ValueError:
            raise DistutilsOptionError("--keep must be an integer")
        if isinstance(patterns, basestring):
            # comma-separated string -> list of converted path patterns
            self.match = [
                convert_path(piece.strip()) for piece in patterns.split(',')
            ]
        self.set_undefined_options('bdist', ('dist_dir', 'dist_dir'))

    def run(self):
        """For each pattern, delete all but the 'keep' newest matches."""
        self.run_command("egg_info")
        from glob import glob

        for suffix in self.match:
            pattern = self.distribution.get_name() + '*' + suffix
            candidates = glob(os.path.join(self.dist_dir, pattern))
            # newest first, ordered by (mtime, path)
            newest_first = sorted(
                ((os.path.getmtime(path), path) for path in candidates),
                reverse=True
            )

            log.info("%d file(s) matching %s", len(newest_first), pattern)
            for _mtime, path in newest_first[self.keep:]:
                log.info("Deleting %s", path)
                if self.dry_run:
                    continue
                os.unlink(path)
class saveopts(option_base):
    """Save command-line options to a file"""

    description = "save supplied options to setup.cfg or other config file"

    def run(self):
        """Collect options whose source is the command line and save them."""
        distribution = self.distribution
        collected = {}

        for command in distribution.command_options:
            if command == 'saveopts':
                # don't save our own options!
                continue

            option_dict = distribution.get_option_dict(command)
            from_cli = dict(
                (option, value)
                for option, (source, value) in option_dict.items()
                if source == "command line"
            )
            # only commands with at least one such option get an entry
            if from_cli:
                collected[command] = from_cli

        edit_config(self.filename, collected, self.dry_run)
# Compatibility aliases: each name below is bound to the Python 2 or the
# Python 3 spelling of the same object, so importers can use one name on
# both major versions.
import sys
import itertools

PY3 = sys.version_info >= (3,)
PY2 = not PY3

if PY2:
    # Re-bind the builtin so it becomes a module attribute of this module.
    basestring = basestring
    import __builtin__ as builtins
    import ConfigParser as configparser
    from StringIO import StringIO
    BytesIO = StringIO
    # Accessors for function/method internals under their Python 2 names.
    func_code = lambda o: o.func_code
    func_globals = lambda o: o.func_globals
    im_func = lambda o: o.im_func
    from htmlentitydefs import name2codepoint
    import httplib
    from BaseHTTPServer import HTTPServer
    from SimpleHTTPServer import SimpleHTTPRequestHandler
    from BaseHTTPServer import BaseHTTPRequestHandler
    iteritems = lambda o: o.iteritems()
    long_type = long
    maxsize = sys.maxint
    unichr = unichr
    unicode = unicode
    bytes = str
    from urllib import url2pathname, splittag, pathname2url
    import urllib2
    from urllib2 import urlopen, HTTPError, URLError, unquote, splituser
    from urlparse import urlparse, urlunparse, urljoin, urlsplit, urlunsplit
    filterfalse = itertools.ifilterfalse
    # Shadow the builtins with the itertools iterator versions.
    filter = itertools.ifilter
    map = itertools.imap

    # The three-argument raise is a syntax error on Python 3, so the
    # definition is kept in a string and only compiled on this branch.
    exec("""def reraise(tp, value, tb=None):
    raise tp, value, tb""")

if PY3:
    basestring = str
    import builtins
    import configparser
    from io import StringIO, BytesIO
    # Same accessors as above, using the Python 3 attribute names.
    func_code = lambda o: o.__code__
    func_globals = lambda o: o.__globals__
    im_func = lambda o: o.__func__
    from html.entities import name2codepoint
    import http.client as httplib
    from http.server import HTTPServer, SimpleHTTPRequestHandler
    from http.server import BaseHTTPRequestHandler
    iteritems = lambda o: o.items()
    long_type = int
    maxsize = sys.maxsize
    unichr = chr
    unicode = str
    bytes = bytes
    from urllib.error import HTTPError, URLError
    import urllib.request as urllib2
    from urllib.request import urlopen, url2pathname, pathname2url
    from urllib.parse import (
        urlparse, urlunparse, unquote, splituser, urljoin, urlsplit,
        urlunsplit, splittag,
    )
    filterfalse = itertools.filterfalse
    filter = filter
    map = map

    def reraise(tp, value, tb=None):
        """Raise *value*, attaching traceback *tb* first if it differs.

        *tp* is accepted for signature parity with the Python 2 variant and
        is not used here.
        """
        if value.__traceback__ is not tb:
            raise value.with_traceback(tb)
        raise value
/lambda_functions/pdf_text_extract/setuptools/depends.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/setuptools/depends.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/setuptools/dist.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/setuptools/dist.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/setuptools/extension.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import re 3 | import functools 4 | import distutils.core 5 | import distutils.errors 6 | import distutils.extension 7 | 8 | from .dist import _get_unpatched 9 | from . import msvc9_support 10 | 11 | _Extension = _get_unpatched(distutils.core.Extension) 12 | 13 | msvc9_support.patch_for_specialized_compiler() 14 | 15 | def _have_cython(): 16 | """ 17 | Return True if Cython can be imported. 18 | """ 19 | cython_impl = 'Cython.Distutils.build_ext', 20 | try: 21 | # from (cython_impl) import build_ext 22 | __import__(cython_impl, fromlist=['build_ext']).build_ext 23 | return True 24 | except Exception: 25 | pass 26 | return False 27 | 28 | # for compatibility 29 | have_pyrex = _have_cython 30 | 31 | 32 | class Extension(_Extension): 33 | """Extension that uses '.c' files in place of '.pyx' files""" 34 | 35 | def _convert_pyx_sources_to_lang(self): 36 | """ 37 | Replace sources with .pyx extensions to sources with the target 38 | language extension. This mechanism allows language authors to supply 39 | pre-converted sources but to prefer the .pyx sources. 
40 | """ 41 | if _have_cython(): 42 | # the build has Cython, so allow it to compile the .pyx files 43 | return 44 | lang = self.language or '' 45 | target_ext = '.cpp' if lang.lower() == 'c++' else '.c' 46 | sub = functools.partial(re.sub, '.pyx$', target_ext) 47 | self.sources = list(map(sub, self.sources)) 48 | 49 | class Library(Extension): 50 | """Just like a regular Extension, but built as a library instead""" 51 | 52 | distutils.core.Extension = Extension 53 | distutils.extension.Extension = Extension 54 | if 'distutils.command.build_ext' in sys.modules: 55 | sys.modules['distutils.command.build_ext'].Extension = Extension 56 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/setuptools/extension.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/setuptools/extension.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/setuptools/gui-32.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/setuptools/gui-32.exe -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/setuptools/gui-64.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/setuptools/gui-64.exe -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/setuptools/gui-arm-32.exe: 
class Mixin2to3(_Mixin2to3):
    """Mixin2to3 variant that can also refactor doctests.

    Fixer names are collected from ``setuptools.lib2to3_fixer_packages`` and
    the distribution's ``use_2to3_fixers``; names listed in an optional
    ``exclude_fixers`` attribute or in the distribution's
    ``use_2to3_exclude_fixers`` are removed from them.
    """

    def run_2to3(self, files, doctests = False):
        """Refactor *files*, as doctests only when *doctests* is true.

        Does nothing unless ``distribution.use_2to3`` is exactly True and
        *files* is non-empty.
        """
        # See if the distribution option has been set, otherwise check the
        # setuptools default.
        if self.distribution.use_2to3 is not True:
            return
        if not files:
            return
        log.info("Fixing "+" ".join(files))
        self.__build_fixer_names()
        self.__exclude_fixers()
        if doctests:
            if setuptools.run_2to3_on_doctests:
                r = DistutilsRefactoringTool(self.fixer_names)
                r.refactor(files, write=True, doctests_only=True)
        else:
            _Mixin2to3.run_2to3(self, files)

    def __build_fixer_names(self):
        """Fill ``self.fixer_names`` once; a non-empty list is left as is."""
        if self.fixer_names: return
        self.fixer_names = []
        for p in setuptools.lib2to3_fixer_packages:
            self.fixer_names.extend(get_fixers_from_package(p))
        if self.distribution.use_2to3_fixers is not None:
            for p in self.distribution.use_2to3_fixers:
                self.fixer_names.extend(get_fixers_from_package(p))

    def __exclude_fixers(self):
        """Remove every excluded fixer name from ``self.fixer_names``."""
        # Work on a copy: extending the attribute's own list in place would
        # make it grow on every call, and would leak into other instances
        # when 'exclude_fixers' is defined on the class.
        excluded_fixers = list(getattr(self, 'exclude_fixers', None) or [])
        if self.distribution.use_2to3_exclude_fixers is not None:
            excluded_fixers.extend(self.distribution.use_2to3_exclude_fixers)
        for fixer_name in excluded_fixers:
            if fixer_name in self.fixer_names:
                self.fixer_names.remove(fixer_name)
def query_vcvarsall(version, *args, **kwargs):
    """
    Call the unpatched ``query_vcvarsall`` and improve its error message.

    When the original raises ``DistutilsPlatformError`` whose first argument
    mentions "vcvarsall.bat", a new ``DistutilsPlatformError`` is raised
    that names the required Visual C++ version (plus a download hint for
    version 9). Any other ``DistutilsPlatformError`` is re-raised unchanged.
    """
    try:
        return unpatched['query_vcvarsall'](version, *args, **kwargs)
    except distutils.errors.DistutilsPlatformError as exc:
        # Guard on exc.args: indexing an empty tuple would raise IndexError
        # and hide the original error.
        if exc.args and "vcvarsall.bat" in exc.args[0]:
            message = 'Microsoft Visual C++ %0.1f is required (%s).' % (version, exc.args[0])
            if int(version) == 9:
                # This redirection link is maintained by Microsoft.
                # Contact vspython@microsoft.com if it needs updating.
                raise distutils.errors.DistutilsPlatformError(
                    message + ' Get it from http://aka.ms/vcpython27'
                )
            raise distutils.errors.DistutilsPlatformError(message)
        raise
if sys.version_info < (3,):
    # Python 2 message objects expose the values through getheaders()
    def get_all_headers(message, key):
        return message.getheaders(key)
else:
    def get_all_headers(message, key):
        """
        Given an HTTPMessage, return all headers matching a given key.
        """
        return message.get_all(key)
try:
    # Available on Python 2.7 and on 3.2 or newer.
    from sysconfig import get_config_vars, get_path
except ImportError:
    from distutils.sysconfig import get_config_vars, get_python_lib

    def get_path(name):
        """
        Fallback for ``sysconfig.get_path`` that only understands
        'purelib' and 'platlib'; any other name raises ValueError.
        """
        wants_platlib = name == 'platlib'
        if not wants_platlib and name != 'purelib':
            raise ValueError("Name must be purelib or platlib")
        return get_python_lib(wants_platlib)

try:
    # Python >=3.2
    from tempfile import TemporaryDirectory
except ImportError:
    import shutil
    import tempfile

    class TemporaryDirectory(object):
        """
        Minimal temporary-directory context manager.

        The directory is created on construction and removed on exit;
        OS errors during removal are ignored.
        """

        def __init__(self):
            # Pre-set the attribute so it exists even if mkdtemp raises.
            self.name = None
            self.name = tempfile.mkdtemp()

        def __enter__(self):
            return self.name

        def __exit__(self, exctype, excvalue, exctrace):
            try:
                shutil.rmtree(self.name, True)
            except OSError:
                # removal errors are not the only possible
                pass
            self.name = None


_PY31 = sys.version_info[:2] == (3, 1)
if _PY31:
    # on Python 3.1, translate testRunner==None to TextTestRunner
    # for compatibility with Python 2.6, 2.7, and 3.2+
    def unittest_main(*args, **kwargs):
        """Call ``unittest.main``, replacing an explicit testRunner=None."""
        if kwargs.get('testRunner', False) is None:
            kwargs['testRunner'] = unittest.TextTestRunner
        return unittest.main(*args, **kwargs)
else:
    unittest_main = unittest.main
def __boot():
    """
    Load the real ``site`` module, then rebuild ``sys.path``.

    The function looks for another ``site`` module on the part of
    ``sys.path`` that follows the PYTHONPATH-sized prefix, skipping this
    file's own directory, and loads it.  It then calls ``addsitedir`` for
    every PYTHONPATH entry and moves any path entries that were not known
    beforehand, and that appear after the first standard path entry, to
    just before that entry.

    Raises ImportError if no other ``site`` module can be found.
    """
    import sys
    import os
    PYTHONPATH = os.environ.get('PYTHONPATH')
    if PYTHONPATH is None or (sys.platform=='win32' and not PYTHONPATH):
        PYTHONPATH = []
    else:
        PYTHONPATH = PYTHONPATH.split(os.pathsep)

    pic = getattr(sys,'path_importer_cache',{})
    # Everything after the first len(PYTHONPATH) entries of sys.path
    # (presumably the PYTHONPATH entries sit at the front -- confirm).
    stdpath = sys.path[len(PYTHONPATH):]
    mydir = os.path.dirname(__file__)
    #print "searching",stdpath,sys.path

    for item in stdpath:
        if item==mydir or not item:
            continue    # skip if current dir. on Windows, or my own directory
        importer = pic.get(item)
        if importer is not None:
            # A cached path importer exists for this entry: ask it for 'site'.
            loader = importer.find_module('site')
            if loader is not None:
                # This should actually reload the current module
                loader.load_module('site')
                break
        else:
            # No cached importer: fall back to the `imp` machinery.
            try:
                import imp # Avoid import loop in Python >= 3.3
                stream, path, descr = imp.find_module('site',[item])
            except ImportError:
                continue
            if stream is None:
                continue
            try:
                # This should actually reload the current module
                imp.load_module('site',stream,path,descr)
            finally:
                stream.close()
            break
    else:
        # The loop ran to completion without a `break`: nothing was loaded.
        raise ImportError("Couldn't find the real 'site' module")

    #print "loaded", __file__

    # NOTE(review): makepath and addsitedir are not defined in this file;
    # presumably the real `site` module loaded above puts them into this
    # module's namespace -- confirm.
    known_paths = dict([(makepath(item)[1],1) for item in sys.path]) # 2.2 comp

    oldpos = getattr(sys,'__egginsert',0)   # save old insertion position
    sys.__egginsert = 0                     # and reset the current one

    for item in PYTHONPATH:
        addsitedir(item)

    sys.__egginsert += oldpos           # restore effective old position

    # Normalized form of the first standard path entry; used below to find
    # where newly added entries should be inserted.
    d, nd = makepath(stdpath[0])
    insert_at = None
    new_path = []

    for item in sys.path:
        p, np = makepath(item)

        if np==nd and insert_at is None:
            # We've hit the first 'system' path entry, so added entries go here
            insert_at = len(new_path)

        if np in known_paths or insert_at is None:
            new_path.append(item)
        else:
            # new path after the insert point, back-insert it
            new_path.insert(insert_at, item)
            insert_at += 1

    sys.path[:] = new_path

# Only bootstrap when this file is being executed under the name 'site'.
if __name__=='site':
    __boot()
    del __boot
# HFS Plus uses decomposed UTF-8
def decompose(path):
    """
    Return `path` normalized to Unicode NFD (decomposed) form.

    Text input is normalized directly.  Any other input is decoded as
    UTF-8, normalized and re-encoded as UTF-8; if that fails with a
    UnicodeError the input is returned unchanged.
    """
    if isinstance(path, decoded_string):
        return unicodedata.normalize('NFD', path)
    try:
        path = path.decode('utf-8')
        path = unicodedata.normalize('NFD', path)
        path = path.encode('utf-8')
    except UnicodeError:
        pass  # Not UTF-8
    return path


def filesys_decode(path):
    """
    Ensure that the given path is decoded.

    Text input is returned as-is.  Otherwise the filesystem encoding is
    tried first and UTF-8 second.  Returns None when neither encoding can
    decode `path`.
    """
    if isinstance(path, decoded_string):
        return path

    # sys.getfilesystemencoding() may return None (documented for
    # Python 2); decoding with None raises TypeError, so use UTF-8 then.
    fs_enc = sys.getfilesystemencoding() or 'utf-8'

    for enc in (fs_enc, 'utf-8'):
        try:
            return path.decode(enc)
        except UnicodeDecodeError:
            continue
    return None


def try_encode(string, enc):
    """
    Encode `string` with the codec `enc`.

    Returns the encoded value, or None when a UnicodeEncodeError is
    raised.
    """
    try:
        return string.encode(enc)
    except UnicodeEncodeError:
        return None
def cs_path_exists(fspath):
    """
    Return True if `fspath` exists and its final component appears, with
    exactly that spelling, in the listing of its parent directory.

    This is a case-sensitive existence check: on a case-insensitive
    filesystem ``os.path.exists`` alone accepts a differently-cased name,
    whereas the directory listing comparison does not.

    A path with no final component (a filesystem root) has nothing to
    compare, so it is reported as existing.
    """
    if not os.path.exists(fspath):
        return False
    # make absolute so we always have a directory
    abspath = os.path.abspath(fspath)
    directory, filename = os.path.split(abspath)
    if not filename:
        # Root paths split into (root, ''); '' is never in a listing.
        return True
    return filename in os.listdir(directory)
def windows_only(func):
    """
    Decorator: keep `func` on Windows; on any other platform substitute
    a callable that accepts any arguments and returns None.
    """
    on_windows = platform.system() == 'Windows'
    return func if on_windows else (lambda *args, **kwargs: None)


@windows_only
def hide_file(path):
    """
    Set the hidden attribute on a file or directory.

    From http://stackoverflow.com/questions/19622133/

    `path` must be text.

    Raises the error produced by ``ctypes.WinError()`` when the
    SetFileAttributesW call reports failure.
    """
    # Importing the submodule makes ctypes.wintypes available below.
    __import__('ctypes.wintypes')
    wintypes = ctypes.wintypes
    set_attributes = ctypes.windll.kernel32.SetFileAttributesW
    set_attributes.argtypes = wintypes.LPWSTR, wintypes.DWORD
    set_attributes.restype = wintypes.BOOL

    FILE_ATTRIBUTE_HIDDEN = 0x02

    if not set_attributes(path, FILE_ATTRIBUTE_HIDDEN):
        raise ctypes.WinError()
5 | Home-page: http://github.com/timClicks/slate 6 | Author: Tim McNamara 7 | Author-email: paperless@timmcnamara.co.nz 8 | License: GPL v3 or later 9 | Description: ====================================================== 10 | slate: the easiest way to get text from PDFs in Python 11 | ====================================================== 12 | 13 | 14 | Slate is a Python package that simplifies the process of extracting 15 | text from PDF files. It depends on the PDFMiner package. 16 | 17 | Slate provides one class, PDF. PDF takes a file-like object and 18 | will extract all text from the document, presenting each page 19 | as a string of text: 20 | 21 | >>> with open('example.pdf') as f: 22 | ... doc = slate.PDF(f) 23 | ... 24 | >>> doc 25 | [..., ..., ...] 26 | >>> doc[1] 27 | 'Text from page 2...' 28 | 29 | If your pdf is password protected, pass the password as the 30 | second argument: 31 | 32 | >>> with open('secrets.pdf') as f: 33 | ... doc = slate.PDF(f, 'password') 34 | ... 35 | >>> doc[0] 36 | "My mother doesn't know this, but..." 37 | 38 | More complex operations 39 | ----------------------- 40 | 41 | If you would like access to the images, font files and other 42 | information, then take some time to learn the PDFMiner API. 43 | 44 | 45 | What is wrong with PDFMiner? 46 | ---------------------------- 47 | 48 | 1. Getting simple things done, like extracting the text 49 | is quite complex. The program is not designed to return 50 | Python objects, which makes interfacing things irritating. 51 | 2. It's an extremely complete set of tools, with multiple 52 | and moderately steep learning curves. 53 | 3. It's not written with hackability in mind. 54 | 55 | 56 | GPL? 57 | ---- 58 | 59 | If you would like to use this software in a non-free manner, 60 | please contact the copyright owner.
61 | 62 | Keywords: pdf,text,text-extraction 63 | Platform: UNKNOWN 64 | Classifier: Development Status :: 4 - Beta 65 | Classifier: Intended Audience :: Developers 66 | Classifier: License :: OSI Approved :: GNU General Public License (GPL) 67 | Classifier: Programming Language :: Python 68 | Classifier: Programming Language :: Python :: 2.6 69 | Classifier: Topic :: Office/Business 70 | Classifier: Topic :: Software Development :: Libraries :: Python Modules 71 | Classifier: Topic :: Text Processing 72 | Classifier: Topic :: Utilities 73 | Requires: pdfminer 74 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/slate-0.3-py2.7.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | .gitignore 2 | LICENSE 3 | README 4 | setup.cfg 5 | setup.py 6 | src/slate/__init__.py 7 | src/slate/conftest.py 8 | src/slate/slate.py 9 | src/slate/test_slate.py 10 | src/slate/utils.py 11 | src/slate.egg-info/PKG-INFO 12 | src/slate.egg-info/SOURCES.txt 13 | src/slate.egg-info/dependency_links.txt 14 | src/slate.egg-info/requires.txt 15 | src/slate.egg-info/top_level.txt -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/slate-0.3-py2.7.egg-info/dependency_links.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/slate-0.3-py2.7.egg-info/installed-files.txt: -------------------------------------------------------------------------------- 1 | ../slate/__init__.py 2 | ../slate/conftest.py 3 | ../slate/slate.py 4 | ../slate/test_slate.py 5 | ../slate/utils.py 6 | ../slate/__init__.pyc 7 | ../slate/conftest.pyc 8 | ../slate/slate.pyc 9 | ../slate/test_slate.pyc 10 | ../slate/utils.pyc 11 | ./ 12 | dependency_links.txt 13 | PKG-INFO 14 | requires.txt 15 
| SOURCES.txt 16 | top_level.txt 17 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/slate-0.3-py2.7.egg-info/requires.txt: -------------------------------------------------------------------------------- 1 | distribute 2 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/slate-0.3-py2.7.egg-info/top_level.txt: -------------------------------------------------------------------------------- 1 | slate 2 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/slate/__init__.py: -------------------------------------------------------------------------------- 1 | #! /usr/env/bin python 2 | 3 | """ 4 | slate provides a convenient interface to PDFMiner[1]. 5 | 6 | Initializing a slate.PDF object will provide you with 7 | the text from the source file as a list of pages. So, 8 | a five page PDF file will have a range of 0-4. 9 | 10 | >>> with open('example.pdf', 'rb') as f: 11 | ... PDF(f) 12 | ... 13 | [..., ..., ..., ...] 14 | 15 | Beware of page numbers. slate.PDF objects start at 0. 16 | 17 | >>> with open('example.pdf', 'rb') as f: 18 | ... doc = PDF(f) 19 | ... 20 | >>> doc[2] 21 | "Hello, I'm page three." 22 | 23 | Passwords are supported. Use them as the second argument 24 | of your initialization. Currently, UTF-8 encoding is 25 | hard-coded. If you would like to access more advanced 26 | features, you should take a look at the PDFMiner API[2]. 27 | 28 | 29 | [1] http://www.unixuser.org/~euske/python/pdfminer/index.html 30 | [2] http://www.unixuser.org/~euske/python/pdfminer/programming.html 31 | """ 32 | 33 | #This file is part of slate.
34 | 35 | #slate is free software: you can redistribute it and/or modify 36 | #it under the terms of the GNU General Public License as published by 37 | #the Free Software Foundation, either version 3 of the License, or 38 | #(at your option) any later version. 39 | 40 | #slate is distributed in the hope that it will be useful, 41 | #but WITHOUT ANY WARRANTY; without even the implied warranty of 42 | #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 43 | #GNU General Public License for more details. 44 | 45 | #You should have received a copy of the GNU General Public License 46 | #along with slate. If not, see . 47 | 48 | from slate import PDF 49 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/slate/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/slate/__init__.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/slate/conftest.py: -------------------------------------------------------------------------------- 1 | option_doctestmodules = True 2 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/slate/conftest.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/slate/conftest.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/slate/slate.py: -------------------------------------------------------------------------------- 1 | from StringIO import StringIO 2 | 3 | from pdfminer.pdfparser import PDFParser, PDFDocument 4 | from 
class PDFPageInterpreter(PI):
    """
    Page interpreter whose ``process_page`` returns the text that the
    device wrote to its output buffer while rendering the page.
    """

    def process_page(self, page):
        """
        Render one page and return the contents of the device's output
        buffer.

        :page: page object providing ``mediabox``, ``rotate``,
            ``resources`` and ``contents``.
        """
        if 1 <= self.debug:
            # The original printed to a bare `stderr` name that this
            # module never imports, raising NameError in debug mode.
            import sys
            sys.stderr.write('Processing page: %r\n' % (page,))
        (x0, y0, x1, y1) = page.mediabox
        # Pick the transformation matrix that matches the page rotation.
        if page.rotate == 90:
            ctm = (0, -1, 1, 0, -y0, x1)
        elif page.rotate == 180:
            ctm = (-1, 0, 0, -1, x1, y1)
        elif page.rotate == 270:
            ctm = (0, 1, -1, 0, y1, -x0)
        else:
            ctm = (1, 0, 0, 1, -x0, -y0)
        # Rewind and empty the shared output buffer so the value returned
        # below holds this page only (presumably relies on the Python 2
        # StringIO `buf` attribute -- confirm).
        self.device.outfp.seek(0)
        self.device.outfp.buf = ''
        self.device.begin_page(page, ctm)
        self.render_contents(page.resources, page.contents, ctm=ctm)
        self.device.end_page(page)
        return self.device.outfp.getvalue()


class PDF(list):
    """
    A list of page texts extracted from a PDF file, one string per page.
    """

    def __init__(self, file, password='', just_text=1):
        """
        Parse `file` and append the text of every page to this list.

        :file: object handed to ``PDFParser``.
        :password: passed to ``PDFDocument.initialize``.
        :just_text: when true, the parsing objects are deleted once the
            text has been collected.
        """
        self.parser = PDFParser(file)
        self.doc = PDFDocument()
        self.parser.set_document(self.doc)
        self.doc.set_parser(self.parser)
        self.doc.initialize(password)
        # NOTE(review): nesting reconstructed from a whitespace-collapsed
        # source; `metadata` is assumed to be set inside this branch and
        # the cleanup to run outside it -- confirm against upstream.
        if self.doc.is_extractable:
            self.resmgr = PDFResourceManager()
            self.device = TextConverter(self.resmgr, outfp=StringIO())
            self.interpreter = PDFPageInterpreter(
                self.resmgr, self.device)
            for page in self.doc.get_pages():
                self.append(self.interpreter.process_page(page))
            self.metadata = self.doc.info
        if just_text:
            self._cleanup()

    def _cleanup(self):
        """
        Frees lots of non-textual information, such as the fonts
        and images and the objects that were needed to parse the
        PDF.
        """
        del self.device
        del self.doc
        del self.parser
        del self.resmgr
        del self.interpreter

    def text(self, clean=True):
        """
        Returns the text of the PDF as a single string.
        Options:

          :clean:
            Removes misc cruft, like lots of whitespace.
        """
        if clean:
            return ''.join(utils.trim_whitespace(page) for page in self)
        else:
            return ''.join(self)
def trim_whitespace(s):
    """
    Returns a string that has at most one whitespace
    character between non-whitespace characters.

    Each run of whitespace is reduced to its final character, and
    leading and trailing whitespace is stripped.

    >>> trim_whitespace('   hi    there')
    'hi there'
    """
    kept = []
    last = len(s) - 1
    for i, letter in enumerate(s):
        # Skip a whitespace character when the next one is whitespace
        # too, so only the last character of each run survives.
        if letter.isspace() and i < last and s[i + 1].isspace():
            continue
        kept.append(letter)

    # Join once instead of concatenating a growing string per character.
    return ''.join(kept).strip()
urllib3/connection.py,sha256=XREoqqZh54Lgag5CLdVlC27bwCpOq0aYrMmNEMtSJWk,10286 4 | urllib3/connectionpool.py,sha256=2J7aN994G8Jeppnrl8eOnEpha3QhBFk-5CE5ldsjwkk,31137 5 | urllib3/exceptions.py,sha256=zGjhZCR1wefEnCN5b7WouQ3UhXesJ2bRKYIeWusaFJs,5599 6 | urllib3/fields.py,sha256=WVUvPfSzNBniw9zKVDoLl9y5ko2qKBjbzkH-bTQMSgQ,5872 7 | urllib3/filepost.py,sha256=NvLlFsdt8ih_Q4S2ekQF3CJG0nOXs32YI-G04_AdT2g,2320 8 | urllib3/poolmanager.py,sha256=W09uewCGoKSzezei0DwaTXT7kuvsF2elO2wUXWfiAco,9614 9 | urllib3/request.py,sha256=jET7OvA3FSjxABBRGhCyMdPvM9XuJA6df9gRhkJiJiY,5988 10 | urllib3/response.py,sha256=6Bs5LNzhW1YEEd6stBFJtruDVFMlWNxo0MFPmtJhvDU,18103 11 | urllib3/contrib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0 12 | urllib3/contrib/appengine.py,sha256=VP10uoVbNpH0kYVbOFd7dN5dtDcVfEytMoriKsDBBuI,7938 13 | urllib3/contrib/ntlmpool.py,sha256=r-vMDMXAGbix9a7-IhbKVTATmAst-5g4hKYOLf8Kd5M,4531 14 | urllib3/contrib/pyopenssl.py,sha256=JBL3GO8YucHXkdpU7uxUGd9UgShsIhAU8oCMJDOo47s,10094 15 | urllib3/contrib/socks.py,sha256=hE8u1190XTNSE_HAtTxwpISa-jnDbpIA1ozlZoIq_Fg,5705 16 | urllib3/packages/__init__.py,sha256=nlChrGzkjCkmhCX9HrF_qHPUgosfsPQkVIJxiiLhk9g,109 17 | urllib3/packages/ordered_dict.py,sha256=VQaPONfhVMsb8B63Xg7ZOydJqIE_jzeMhVN3Pec6ogw,8935 18 | urllib3/packages/six.py,sha256=U-rO-WBrFS8PxHeamSl6okKCjqPF18NhiZb0qPZ67XM,11628 19 | urllib3/packages/ssl_match_hostname/__init__.py,sha256=cOWMIn1orgJoA35p6pSzO_-Dc6iOX9Dhl6D2sL9b_2o,460 20 | urllib3/packages/ssl_match_hostname/_implementation.py,sha256=fK28k37hL7-D79v9iM2fHgNK9Q1Pw0M7qVRL4rkfFjQ,3778 21 | urllib3/util/__init__.py,sha256=7LnyUDyddbD9VVmsbPP0ckT2paVTmgLPs5E_BUoHVu8,854 22 | urllib3/util/connection.py,sha256=6PvDBlK_6QDLHzEDT-uEMhqKcDoSuRO43Vtb4IXfkzQ,3380 23 | urllib3/util/request.py,sha256=ZMDewRK-mjlK72szGIIjzYnLIn-zPP0WgJUMjKeZ6Tg,2128 24 | urllib3/util/response.py,sha256=1UFd5TIp9MyBp4xgnZoyQZscZVPPr0tWRaXNR5w_vds,2165 25 | 
urllib3/util/retry.py,sha256=EC10NTVcyHOWzBlyKynLvr5ZgghcfwA-rjH4P2_RNE0,9975 26 | urllib3/util/ssl_.py,sha256=bm46-ql6Wq6ulhJw604iBTG16QHDzHB03cbLyvlIXq4,11464 27 | urllib3/util/timeout.py,sha256=ioAIYptFyBG7eU_r8_ZmO45hpj1dJE6WCvrGR9dNFjs,9596 28 | urllib3/util/url.py,sha256=EcX4ZfmgKWcqM4sY9FlC-yN4y_snuURPV0TpUPHNjnc,5879 29 | urllib3-1.14.dist-info/DESCRIPTION.rst,sha256=hud3mTd3qo3nF7giyoJMlmRXK-W1bu4uXySgqd0wkv8,24151 30 | urllib3-1.14.dist-info/METADATA,sha256=1BeC8yJkDQYsjh332O00jkPtPdMTbf53-Rvxy78cavE,25190 31 | urllib3-1.14.dist-info/metadata.json,sha256=zHDlvIbjCPUvbbOBrqhbF6xI8TcEKZjUI2quKMm1Etg,1178 32 | urllib3-1.14.dist-info/pbr.json,sha256=Bp61WOs2E_Dy1arJqQSN7uuuWVh-ZwEhoBsNUlxup_k,47 33 | urllib3-1.14.dist-info/RECORD,, 34 | urllib3-1.14.dist-info/top_level.txt,sha256=EMiXL2sKrTcmrMxIHTqdc3ET54pQI2Y072LexFEemvo,8 35 | urllib3-1.14.dist-info/WHEEL,sha256=AvR0WeTpDaxT645bl5FQxUK6NPsTls2ttpcGJg3j1Xg,110 36 | urllib3/filepost.pyc,, 37 | urllib3/contrib/__init__.pyc,, 38 | urllib3/util/response.pyc,, 39 | urllib3/packages/ordered_dict.pyc,, 40 | urllib3/packages/__init__.pyc,, 41 | urllib3/connection.pyc,, 42 | urllib3/connectionpool.pyc,, 43 | urllib3/util/timeout.pyc,, 44 | urllib3/packages/ssl_match_hostname/__init__.pyc,, 45 | urllib3/fields.pyc,, 46 | urllib3/util/__init__.pyc,, 47 | urllib3/response.pyc,, 48 | urllib3/packages/six.pyc,, 49 | urllib3/__init__.pyc,, 50 | urllib3/contrib/ntlmpool.pyc,, 51 | urllib3/poolmanager.pyc,, 52 | urllib3/contrib/pyopenssl.pyc,, 53 | urllib3/util/ssl_.pyc,, 54 | urllib3/util/request.pyc,, 55 | urllib3/packages/ssl_match_hostname/_implementation.pyc,, 56 | urllib3/request.pyc,, 57 | urllib3/util/connection.pyc,, 58 | urllib3/contrib/appengine.pyc,, 59 | urllib3/contrib/socks.pyc,, 60 | urllib3/exceptions.pyc,, 61 | urllib3/util/retry.pyc,, 62 | urllib3/_collections.pyc,, 63 | urllib3/util/url.pyc,, 64 | -------------------------------------------------------------------------------- 
/lambda_functions/pdf_text_extract/urllib3-1.14.dist-info/WHEEL: -------------------------------------------------------------------------------- 1 | Wheel-Version: 1.0 2 | Generator: bdist_wheel (0.24.0) 3 | Root-Is-Purelib: true 4 | Tag: py2-none-any 5 | Tag: py3-none-any 6 | 7 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/urllib3-1.14.dist-info/metadata.json: -------------------------------------------------------------------------------- 1 | {"license": "MIT", "name": "urllib3", "metadata_version": "2.0", "generator": "bdist_wheel (0.24.0)", "test_requires": [{"requires": ["nose", "mock", "tornado"]}], "summary": "HTTP library with thread-safe connection pooling, file post, and more.", "run_requires": [{"requires": ["certifi"], "extra": "secure"}, {"environment": "python_version<=\"2.7\"", "requires": ["pyOpenSSL>=0.13", "ndg-httpsclient", "pyasn1"], "extra": "secure"}], "version": "1.14", "extensions": {"python.details": {"project_urls": {"Home": "http://urllib3.readthedocs.org/"}, "document_names": {"description": "DESCRIPTION.rst"}, "contacts": [{"role": "author", "email": "andrey.petrov@shazow.net", "name": "Andrey Petrov"}]}}, "keywords": ["urllib", "httplib", "threadsafe", "filepost", "http", "https", "ssl", "pooling"], "classifiers": ["Environment :: Web Environment", "Intended Audience :: Developers", "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", "Programming Language :: Python", "Programming Language :: Python :: 2", "Programming Language :: Python :: 3", "Topic :: Internet :: WWW/HTTP", "Topic :: Software Development :: Libraries"], "extras": ["secure"]} -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/urllib3-1.14.dist-info/pbr.json: -------------------------------------------------------------------------------- 1 | {"is_release": false, "git_version": "27df29b"} 
-------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/urllib3-1.14.dist-info/top_level.txt: -------------------------------------------------------------------------------- 1 | urllib3 2 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/urllib3/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | urllib3 - Thread-safe connection pooling and re-using. 3 | """ 4 | 5 | from __future__ import absolute_import 6 | import warnings 7 | 8 | from .connectionpool import ( 9 | HTTPConnectionPool, 10 | HTTPSConnectionPool, 11 | connection_from_url 12 | ) 13 | 14 | from . import exceptions 15 | from .filepost import encode_multipart_formdata 16 | from .poolmanager import PoolManager, ProxyManager, proxy_from_url 17 | from .response import HTTPResponse 18 | from .util.request import make_headers 19 | from .util.url import get_host 20 | from .util.timeout import Timeout 21 | from .util.retry import Retry 22 | 23 | 24 | # Set default logging handler to avoid "No handler found" warnings. 
import logging
try:  # Python 2.7+
    from logging import NullHandler
except ImportError:
    class NullHandler(logging.Handler):
        """Fallback handler that discards every record."""
        def emit(self, record):
            pass

__author__ = 'Andrey Petrov (andrey.petrov@shazow.net)'
__license__ = 'MIT'
__version__ = '1.14'

__all__ = (
    'HTTPConnectionPool',
    'HTTPSConnectionPool',
    'PoolManager',
    'ProxyManager',
    'HTTPResponse',
    'Retry',
    'Timeout',
    'add_stderr_logger',
    'connection_from_url',
    'disable_warnings',
    'encode_multipart_formdata',
    'get_host',
    'make_headers',
    'proxy_from_url',
)

# Attach a do-nothing handler to this package's logger.
logging.getLogger(__name__).addHandler(NullHandler())


def add_stderr_logger(level=logging.DEBUG):
    """
    Helper for quickly adding a StreamHandler to the logger. Useful for
    debugging.

    The handler gets a '%(asctime)s %(levelname)s %(message)s' formatter,
    and the package logger's level is set to `level`.

    Returns the handler after adding it.
    """
    # This method needs to be in this __init__.py to get the __name__ correct
    # even if urllib3 is vendored within another package.
    logger = logging.getLogger(__name__)
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
    logger.addHandler(handler)
    logger.setLevel(level)
    logger.debug('Added a stderr logging handler to logger: %s', __name__)
    return handler

# ... Clean up.
# NullHandler was only needed for the addHandler call above.
del NullHandler


# SecurityWarning's always go off by default.
warnings.simplefilter('always', exceptions.SecurityWarning, append=True)
# SubjectAltNameWarning's should go off once per host
warnings.simplefilter('default', exceptions.SubjectAltNameWarning)
# InsecurePlatformWarning's don't vary between requests, so we keep it default.
warnings.simplefilter('default', exceptions.InsecurePlatformWarning,
                      append=True)
# SNIMissingWarnings should go off only once.
def disable_warnings(category=exceptions.HTTPWarning):
    """
    Install an ``ignore`` warnings filter for ``category``.

    :param category:
        Warning class to silence. Defaults to ``exceptions.HTTPWarning``.
    """
    warnings.simplefilter(action='ignore', category=category)
log = getLogger(__name__)


class NTLMConnectionPool(HTTPSConnectionPool):
    """
    Implements an NTLM authentication version of an urllib3 connection pool
    """

    scheme = 'https'

    def __init__(self, user, pw, authurl, *args, **kwargs):
        """
        authurl is a random URL on the server that is protected by NTLM.
        user is the Windows user, in the DOMAIN\\username format.
        pw is the password for the user.

        Remaining positional and keyword arguments are forwarded unchanged to
        the parent pool's constructor.

        Raises IndexError when ``user`` contains no backslash, because the
        part after the domain is then missing.
        """
        super(NTLMConnectionPool, self).__init__(*args, **kwargs)
        self.authurl = authurl
        self.rawuser = user
        user_parts = user.split('\\', 1)
        self.domain = user_parts[0].upper()
        self.user = user_parts[1]
        self.pw = pw

    def _new_conn(self):
        """
        Open an HTTPS connection and run the NTLM handshake on it.

        Two GET requests are sent to ``self.authurl`` on the same connection:
        the negotiate message, then the authenticate message built from the
        server's challenge. The connection is returned once the second
        request answers with status 200.

        Raises Exception when the challenge header is missing or carries no
        NTLM token, when the server answers 401 (wrong credentials), or when
        it answers with any other non-200 status.
        """
        # Performs the NTLM handshake that secures the connection. The socket
        # must be kept open while requests are performed.
        self.num_connections += 1
        log.debug('Starting NTLM HTTPS connection no. %d: https://%s%s',
                  self.num_connections, self.host, self.authurl)

        headers = {}
        headers['Connection'] = 'Keep-Alive'
        req_header = 'Authorization'
        resp_header = 'www-authenticate'

        conn = HTTPSConnection(host=self.host, port=self.port)

        # Send negotiation message
        headers[req_header] = (
            'NTLM %s' % ntlm.create_NTLM_NEGOTIATE_MESSAGE(self.rawuser))
        log.debug('Request headers: %s', headers)
        conn.request('GET', self.authurl, None, headers)
        res = conn.getresponse()
        reshdr = dict(res.getheaders())
        log.debug('Response status: %s %s', res.status, res.reason)
        log.debug('Response headers: %s', reshdr)
        log.debug('Response data: %s [...]', res.read(100))

        # getheader() matches the header name case-insensitively, so the
        # challenge is found whatever casing the server (or the Python
        # version's header container) uses. A plain dict lookup is not.
        challenge_header = res.getheader(resp_header)

        # Remove the reference to the socket, so that it can not be closed by
        # the response object (we want to keep the socket open)
        res.fp = None

        # Server should respond with a challenge message
        if challenge_header is None:
            raise Exception('Unexpected %s response header: %s' %
                            (resp_header, challenge_header))
        auth_header_values = challenge_header.split(', ')
        auth_header_value = None
        for s in auth_header_values:
            if s[:5] == 'NTLM ':
                auth_header_value = s[5:]
        if auth_header_value is None:
            raise Exception('Unexpected %s response header: %s' %
                            (resp_header, challenge_header))

        # Send authentication message
        ServerChallenge, NegotiateFlags = \
            ntlm.parse_NTLM_CHALLENGE_MESSAGE(auth_header_value)
        auth_msg = ntlm.create_NTLM_AUTHENTICATE_MESSAGE(ServerChallenge,
                                                         self.user,
                                                         self.domain,
                                                         self.pw,
                                                         NegotiateFlags)
        headers[req_header] = 'NTLM %s' % auth_msg
        log.debug('Request headers: %s', headers)
        conn.request('GET', self.authurl, None, headers)
        res = conn.getresponse()
        log.debug('Response status: %s %s', res.status, res.reason)
        log.debug('Response headers: %s', dict(res.getheaders()))
        log.debug('Response data: %s [...]', res.read()[:100])
        if res.status != 200:
            if res.status == 401:
                raise Exception('Server rejected request: wrong '
                                'username or password')
            raise Exception('Wrong server response: %s %s' %
                            (res.status, res.reason))

        res.fp = None
        log.debug('Connection established')
        return conn

    def urlopen(self, method, url, body=None, headers=None, retries=3,
                redirect=True, assert_same_host=True):
        """
        Issue a request through the parent pool with ``Connection:
        Keep-Alive`` forced onto the outgoing headers.

        The header is set on a copy, so a mapping passed in by the caller is
        never modified. All arguments are forwarded positionally to the
        parent's ``urlopen`` and its result is returned.
        """
        headers = {} if headers is None else headers.copy()
        headers['Connection'] = 'Keep-Alive'
        return super(NTLMConnectionPool, self).urlopen(method, url, body,
                                                       headers, retries,
                                                       redirect,
                                                       assert_same_host)
# Stream-writer factory for UTF-8: wraps a byte stream so text can be written.
writer = codecs.lookup('utf-8')[3]


def choose_boundary():
    """
    Return a fresh multipart boundary: the hex form of a random UUID4.
    """
    return uuid4().hex


def iter_field_objects(fields):
    """
    Yield a :class:`~urllib3.fields.RequestField` for every entry of
    ``fields``.

    ``fields`` may be a dict, an iterable of (k, v) tuples, or an iterable
    that already contains :class:`~urllib3.fields.RequestField` objects.
    Entries that are not request fields are converted with
    ``RequestField.from_tuples``.
    """
    if isinstance(fields, dict):
        entries = six.iteritems(fields)
    else:
        entries = iter(fields)

    for entry in entries:
        yield (entry if isinstance(entry, RequestField)
               else RequestField.from_tuples(*entry))


def iter_fields(fields):
    """
    .. deprecated:: 1.6

    Return a generator of (key, value) pairs taken from ``fields``.

    Superseded by :func:`iter_field_objects`, which produces
    :class:`~urllib3.fields.RequestField` objects instead.

    Supports list of (k, v) tuples and dicts.
    """
    pairs = six.iteritems(fields) if isinstance(fields, dict) else fields
    return ((key, value) for key, value in pairs)


def encode_multipart_formdata(fields, boundary=None):
    """
    Encode ``fields`` using the multipart/form-data MIME format.

    :param fields:
        Anything accepted by :func:`iter_field_objects`.

    :param boundary:
        Boundary string to use. If not specified, one is generated with
        :func:`choose_boundary`.

    :return:
        A ``(body, content_type)`` tuple: the encoded body and the matching
        ``multipart/form-data`` content-type header value.
    """
    if boundary is None:
        boundary = choose_boundary()

    body = BytesIO()
    text_out = writer(body)

    for part in iter_field_objects(fields):
        body.write(b('--%s\r\n' % boundary))
        text_out.write(part.render_headers())

        payload = part.data
        if isinstance(payload, int):
            payload = str(payload)  # Backwards compatibility

        # Text goes through the UTF-8 writer; anything else is written raw.
        if isinstance(payload, six.text_type):
            text_out.write(payload)
        else:
            body.write(payload)

        body.write(b'\r\n')

    body.write(b('--%s--\r\n' % boundary))

    return body.getvalue(), str('multipart/form-data; boundary=%s' % boundary)
import ssl_match_hostname 4 | 5 | __all__ = ('ssl_match_hostname', ) 6 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/urllib3/packages/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/urllib3/packages/__init__.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/urllib3/packages/ordered_dict.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/urllib3/packages/ordered_dict.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/urllib3/packages/six.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/urllib3/packages/six.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/urllib3/packages/ssl_match_hostname/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | # Python 3.2+ 3 | from ssl import CertificateError, match_hostname 4 | except ImportError: 5 | try: 6 | # Backport of the function from a pypi module 7 | from backports.ssl_match_hostname import CertificateError, match_hostname 8 | except ImportError: 9 | # Our vendored copy 10 | from ._implementation import CertificateError, match_hostname 11 | 12 | # Not needed, but documenting what we provide. 
13 | __all__ = ('CertificateError', 'match_hostname') 14 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/urllib3/packages/ssl_match_hostname/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/urllib3/packages/ssl_match_hostname/__init__.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/urllib3/packages/ssl_match_hostname/_implementation.py: -------------------------------------------------------------------------------- 1 | """The match_hostname() function from Python 3.3.3, essential when using SSL.""" 2 | 3 | # Note: This file is under the PSF license as the code comes from the python 4 | # stdlib. http://docs.python.org/3/license.html 5 | 6 | import re 7 | 8 | __version__ = '3.4.0.2' 9 | 10 | class CertificateError(ValueError): 11 | pass 12 | 13 | 14 | def _dnsname_match(dn, hostname, max_wildcards=1): 15 | """Matching according to RFC 6125, section 6.4.3 16 | 17 | http://tools.ietf.org/html/rfc6125#section-6.4.3 18 | """ 19 | pats = [] 20 | if not dn: 21 | return False 22 | 23 | # Ported from python3-syntax: 24 | # leftmost, *remainder = dn.split(r'.') 25 | parts = dn.split(r'.') 26 | leftmost = parts[0] 27 | remainder = parts[1:] 28 | 29 | wildcards = leftmost.count('*') 30 | if wildcards > max_wildcards: 31 | # Issue #17980: avoid denials of service by refusing more 32 | # than one wildcard per fragment. A survey of established 33 | # policy among SSL implementations showed it to be a 34 | # reasonable choice. 
35 | raise CertificateError( 36 | "too many wildcards in certificate DNS name: " + repr(dn)) 37 | 38 | # speed up common case w/o wildcards 39 | if not wildcards: 40 | return dn.lower() == hostname.lower() 41 | 42 | # RFC 6125, section 6.4.3, subitem 1. 43 | # The client SHOULD NOT attempt to match a presented identifier in which 44 | # the wildcard character comprises a label other than the left-most label. 45 | if leftmost == '*': 46 | # When '*' is a fragment by itself, it matches a non-empty dotless 47 | # fragment. 48 | pats.append('[^.]+') 49 | elif leftmost.startswith('xn--') or hostname.startswith('xn--'): 50 | # RFC 6125, section 6.4.3, subitem 3. 51 | # The client SHOULD NOT attempt to match a presented identifier 52 | # where the wildcard character is embedded within an A-label or 53 | # U-label of an internationalized domain name. 54 | pats.append(re.escape(leftmost)) 55 | else: 56 | # Otherwise, '*' matches any dotless string, e.g. www* 57 | pats.append(re.escape(leftmost).replace(r'\*', '[^.]*')) 58 | 59 | # add the remaining fragments, ignore any wildcards 60 | for frag in remainder: 61 | pats.append(re.escape(frag)) 62 | 63 | pat = re.compile(r'\A' + r'\.'.join(pats) + r'\Z', re.IGNORECASE) 64 | return pat.match(hostname) 65 | 66 | 67 | def match_hostname(cert, hostname): 68 | """Verify that *cert* (in decoded format as returned by 69 | SSLSocket.getpeercert()) matches the *hostname*. RFC 2818 and RFC 6125 70 | rules are followed, but IP addresses are not accepted for *hostname*. 71 | 72 | CertificateError is raised on failure. On success, the function 73 | returns nothing. 
74 | """ 75 | if not cert: 76 | raise ValueError("empty or no certificate") 77 | dnsnames = [] 78 | san = cert.get('subjectAltName', ()) 79 | for key, value in san: 80 | if key == 'DNS': 81 | if _dnsname_match(value, hostname): 82 | return 83 | dnsnames.append(value) 84 | if not dnsnames: 85 | # The subject is only checked when there is no dNSName entry 86 | # in subjectAltName 87 | for sub in cert.get('subject', ()): 88 | for key, value in sub: 89 | # XXX according to RFC 2818, the most specific Common Name 90 | # must be used. 91 | if key == 'commonName': 92 | if _dnsname_match(value, hostname): 93 | return 94 | dnsnames.append(value) 95 | if len(dnsnames) > 1: 96 | raise CertificateError("hostname %r " 97 | "doesn't match either of %s" 98 | % (hostname, ', '.join(map(repr, dnsnames)))) 99 | elif len(dnsnames) == 1: 100 | raise CertificateError("hostname %r " 101 | "doesn't match %r" 102 | % (hostname, dnsnames[0])) 103 | else: 104 | raise CertificateError("no appropriate commonName or " 105 | "subjectAltName fields were found") 106 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/urllib3/packages/ssl_match_hostname/_implementation.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/urllib3/packages/ssl_match_hostname/_implementation.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/urllib3/poolmanager.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/urllib3/poolmanager.pyc -------------------------------------------------------------------------------- 
/lambda_functions/pdf_text_extract/urllib3/request.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/urllib3/request.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/urllib3/response.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/urllib3/response.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/urllib3/util/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | # For backwards compatibility, provide imports that used to be here. 
3 | from .connection import is_connection_dropped 4 | from .request import make_headers 5 | from .response import is_fp_closed 6 | from .ssl_ import ( 7 | SSLContext, 8 | HAS_SNI, 9 | assert_fingerprint, 10 | resolve_cert_reqs, 11 | resolve_ssl_version, 12 | ssl_wrap_socket, 13 | ) 14 | from .timeout import ( 15 | current_time, 16 | Timeout, 17 | ) 18 | 19 | from .retry import Retry 20 | from .url import ( 21 | get_host, 22 | parse_url, 23 | split_first, 24 | Url, 25 | ) 26 | 27 | __all__ = ( 28 | 'HAS_SNI', 29 | 'SSLContext', 30 | 'Retry', 31 | 'Timeout', 32 | 'Url', 33 | 'assert_fingerprint', 34 | 'current_time', 35 | 'is_connection_dropped', 36 | 'is_fp_closed', 37 | 'get_host', 38 | 'parse_url', 39 | 'make_headers', 40 | 'resolve_cert_reqs', 41 | 'resolve_ssl_version', 42 | 'split_first', 43 | 'ssl_wrap_socket', 44 | ) 45 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/urllib3/util/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/urllib3/util/__init__.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/urllib3/util/connection.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | import socket 3 | try: 4 | from select import poll, POLLIN 5 | except ImportError: # `poll` doesn't exist on OSX and other platforms 6 | poll = False 7 | try: 8 | from select import select 9 | except ImportError: # `select` doesn't exist on AppEngine. 10 | select = False 11 | 12 | 13 | def is_connection_dropped(conn): # Platform-specific 14 | """ 15 | Returns True if the connection is dropped and should be closed. 16 | 17 | :param conn: 18 | :class:`httplib.HTTPConnection` object. 
19 | 20 | Note: For platforms like AppEngine, this will always return ``False`` to 21 | let the platform handle connection recycling transparently for us. 22 | """ 23 | sock = getattr(conn, 'sock', False) 24 | if sock is False: # Platform-specific: AppEngine 25 | return False 26 | if sock is None: # Connection already closed (such as by httplib). 27 | return True 28 | 29 | if not poll: 30 | if not select: # Platform-specific: AppEngine 31 | return False 32 | 33 | try: 34 | return select([sock], [], [], 0.0)[0] 35 | except socket.error: 36 | return True 37 | 38 | # This version is better on platforms that support it. 39 | p = poll() 40 | p.register(sock, POLLIN) 41 | for (fno, ev) in p.poll(0.0): 42 | if fno == sock.fileno(): 43 | # Either data is buffered (bad), or the connection is dropped. 44 | return True 45 | 46 | 47 | # This function is copied from socket.py in the Python 2.7 standard 48 | # library test suite. Added to its signature is only `socket_options`. 49 | def create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, 50 | source_address=None, socket_options=None): 51 | """Connect to *address* and return the socket object. 52 | 53 | Convenience function. Connect to *address* (a 2-tuple ``(host, 54 | port)``) and return the socket object. Passing the optional 55 | *timeout* parameter will set the timeout on the socket instance 56 | before attempting to connect. If no *timeout* is supplied, the 57 | global default timeout setting returned by :func:`getdefaulttimeout` 58 | is used. If *source_address* is set it must be a tuple of (host, port) 59 | for the socket to bind as a source address before making the connection. 60 | An host of '' or port 0 tells the OS to use the default. 
61 | """ 62 | 63 | host, port = address 64 | if host.startswith('['): 65 | host = host.strip('[]') 66 | err = None 67 | for res in socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM): 68 | af, socktype, proto, canonname, sa = res 69 | sock = None 70 | try: 71 | sock = socket.socket(af, socktype, proto) 72 | 73 | # If provided, set socket level options before connecting. 74 | # This is the only addition urllib3 makes to this function. 75 | _set_socket_options(sock, socket_options) 76 | 77 | if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT: 78 | sock.settimeout(timeout) 79 | if source_address: 80 | sock.bind(source_address) 81 | sock.connect(sa) 82 | return sock 83 | 84 | except socket.error as e: 85 | err = e 86 | if sock is not None: 87 | sock.close() 88 | sock = None 89 | 90 | if err is not None: 91 | raise err 92 | 93 | raise socket.error("getaddrinfo returns an empty list") 94 | 95 | 96 | def _set_socket_options(sock, options): 97 | if options is None: 98 | return 99 | 100 | for opt in options: 101 | sock.setsockopt(*opt) 102 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/urllib3/util/connection.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/urllib3/util/connection.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/urllib3/util/request.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from base64 import b64encode 3 | 4 | from ..packages.six import b 5 | 6 | ACCEPT_ENCODING = 'gzip,deflate' 7 | 8 | 9 | def make_headers(keep_alive=None, accept_encoding=None, user_agent=None, 10 | basic_auth=None, proxy_basic_auth=None, disable_cache=None): 11 | """ 12 | Shortcuts for 
generating request headers. 13 | 14 | :param keep_alive: 15 | If ``True``, adds 'connection: keep-alive' header. 16 | 17 | :param accept_encoding: 18 | Can be a boolean, list, or string. 19 | ``True`` translates to 'gzip,deflate'. 20 | List will get joined by comma. 21 | String will be used as provided. 22 | 23 | :param user_agent: 24 | String representing the user-agent you want, such as 25 | "python-urllib3/0.6" 26 | 27 | :param basic_auth: 28 | Colon-separated username:password string for 'authorization: basic ...' 29 | auth header. 30 | 31 | :param proxy_basic_auth: 32 | Colon-separated username:password string for 'proxy-authorization: basic ...' 33 | auth header. 34 | 35 | :param disable_cache: 36 | If ``True``, adds 'cache-control: no-cache' header. 37 | 38 | Example:: 39 | 40 | >>> make_headers(keep_alive=True, user_agent="Batman/1.0") 41 | {'connection': 'keep-alive', 'user-agent': 'Batman/1.0'} 42 | >>> make_headers(accept_encoding=True) 43 | {'accept-encoding': 'gzip,deflate'} 44 | """ 45 | headers = {} 46 | if accept_encoding: 47 | if isinstance(accept_encoding, str): 48 | pass 49 | elif isinstance(accept_encoding, list): 50 | accept_encoding = ','.join(accept_encoding) 51 | else: 52 | accept_encoding = ACCEPT_ENCODING 53 | headers['accept-encoding'] = accept_encoding 54 | 55 | if user_agent: 56 | headers['user-agent'] = user_agent 57 | 58 | if keep_alive: 59 | headers['connection'] = 'keep-alive' 60 | 61 | if basic_auth: 62 | headers['authorization'] = 'Basic ' + \ 63 | b64encode(b(basic_auth)).decode('utf-8') 64 | 65 | if proxy_basic_auth: 66 | headers['proxy-authorization'] = 'Basic ' + \ 67 | b64encode(b(proxy_basic_auth)).decode('utf-8') 68 | 69 | if disable_cache: 70 | headers['cache-control'] = 'no-cache' 71 | 72 | return headers 73 | -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/urllib3/util/request.pyc: 
def is_fp_closed(obj):
    """
    Report whether a file-like object is closed.

    :param obj:
        The file-like object to check.

    :return:
        ``obj.closed`` when that attribute exists; otherwise whether
        ``obj.fp`` is ``None``.

    :raises ValueError:
        If ``obj`` exposes neither ``closed`` nor ``fp``.
    """
    # First choice: the official file-like-object attribute.
    try:
        closed_flag = obj.closed
    except AttributeError:
        pass
    else:
        return closed_flag

    # Second choice: a container for another file-like object that gets
    # released on exhaustion (e.g. HTTPResponse).
    try:
        wrapped = obj.fp
    except AttributeError:
        pass
    else:
        return wrapped is None

    raise ValueError("Unable to determine whether fp is closed.")


def assert_header_parsing(headers):
    """
    Raise if the parsed ``headers`` carry defects or unparsed data.

    Defects are read from ``headers.defects`` and leftover data from
    ``headers.get_payload()``, each only when that attribute is present.

    :param headers: Headers to verify.
    :type headers: `httplib.HTTPMessage`.

    :raises TypeError:
        If ``headers`` is not an ``httplib.HTTPMessage``.

    :raises urllib3.exceptions.HeaderParsingError:
        If parsing errors are found.
    """
    # Without this explicit check a wrong argument type would simply have
    # neither attribute below and pass silently.
    if not isinstance(headers, httplib.HTTPMessage):
        raise TypeError('expected httplib.Message, got {0}.'.format(
            type(headers)))

    found_defects = getattr(headers, 'defects', None)
    payload_getter = getattr(headers, 'get_payload', None)

    # Platform-specific: get_payload is only there on Python 3.
    leftover = payload_getter() if payload_getter else None

    if found_defects or leftover:
        raise HeaderParsingError(defects=found_defects,
                                 unparsed_data=leftover)


def is_response_to_head(response):
    """
    Checks whether the request of a response has been a HEAD-request.
    Handles the quirks of AppEngine.

    :param response:
        Object whose private ``_method`` attribute holds the request method.
    :type response: :class:`httplib.HTTPResponse`
    """
    # FIXME: Can we do this somehow without accessing private httplib _method?
    verb = response._method
    # Platform-specific: Appengine stores the method as an integer.
    return verb == 3 if isinstance(verb, int) else verb.upper() == 'HEAD'
https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/urllib3/util/ssl_.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/urllib3/util/timeout.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/urllib3/util/timeout.pyc -------------------------------------------------------------------------------- /lambda_functions/pdf_text_extract/urllib3/util/url.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/theemadnes/PDF_text_extract/cc077ebec7b021cd0c4bc36b6866ca30a37a776b/lambda_functions/pdf_text_extract/urllib3/util/url.pyc --------------------------------------------------------------------------------