├── pewtils ├── VERSION ├── general_link_shorteners.csv ├── regex.py ├── vanity_link_shorteners.csv ├── io.py ├── http.py └── __init__.py ├── tests ├── files │ ├── __init__.py │ ├── subfolder │ │ ├── __init__.py │ │ ├── subfolder_json.json │ │ └── subfolder_py.py │ ├── json.json │ ├── py.py │ ├── example_stripped.html │ ├── example_stripped_simple.html │ └── example.html ├── __init__.py ├── regex.py ├── http.py ├── io.py └── base.py ├── docs_source ├── _static │ ├── .gitkeep │ └── theme_overrides.css ├── regex.rst ├── io.rst ├── http.rst ├── http_link_shorteners.rst ├── pewtils_core.rst ├── index.rst ├── conf.py └── examples.rst ├── MANIFEST.in ├── .gitignore ├── .bulldozer.yml ├── CHANGELOG.md ├── requirements.txt ├── .coveragerc ├── .bumpversion.cfg ├── .policy.yml ├── .github ├── workflows │ ├── build-docs.yaml │ ├── unit-tests.yaml │ ├── build-release.yaml │ └── build-main.yaml └── runner.yaml ├── setup.py ├── Makefile ├── CONTRIBUTING.md ├── README.md └── LICENSE /pewtils/VERSION: -------------------------------------------------------------------------------- 1 | 0.1.1 -------------------------------------------------------------------------------- /tests/files/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs_source/_static/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/files/subfolder/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/files/json.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_val": 1 3 | } -------------------------------------------------------------------------------- /tests/files/py.py: -------------------------------------------------------------------------------- 1 | def test(): 2 | return "test1" 3 | -------------------------------------------------------------------------------- /tests/files/subfolder/subfolder_json.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_val": 2 3 | } -------------------------------------------------------------------------------- /tests/files/subfolder/subfolder_py.py: -------------------------------------------------------------------------------- 1 | def test(): 2 | return "test2" 3 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include requirements.txt 3 | include pewtils/*.csv 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .env/ 2 | .ipynb_checkpoints 3 | *.pyc 4 | .python-version 5 | .idea/ 6 | _build/ 7 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import * 2 | from .http import * 3 | from .io import * 4 | from .regex import * 5 | -------------------------------------------------------------------------------- /tests/files/example_stripped.html: 
--------------------------------------------------------------------------------
 1 | Example Domain
 2 | 
 3 | This domain is established to be used for illustrative examples in documents. You may use this domain in examples without prior coordination or asking for permission.
 4 | 
 5 | More information...
--------------------------------------------------------------------------------
/.bulldozer.yml:
--------------------------------------------------------------------------------
 1 | version: 1
 2 | 
 3 | merge:
 4 |   method: merge
 5 |   whitelist:
 6 |     labels: ["automerge"]
 7 |     comment_substrings: ["==AUTOMERGE=="]
 8 |   blacklist:
 9 |     labels: ["block"]
10 |     comment_substrings: ["==BLOCK=="]
11 | 
12 | update:
13 |   whitelist:
14 |     labels: ["wip", "update me"]
15 | 
--------------------------------------------------------------------------------
/docs_source/regex.rst:
--------------------------------------------------------------------------------
 1 | **************
 2 | Regex Patterns
 3 | **************
 4 | 
 5 | This module contains a modest but growing collection of regular expressions, useful for \
 6 | extracting things like URLs, monetary values, and more.
 7 | 
 8 | .. automodule :: pewtils.regex
 9 |    :autosummary:
10 |    :members:
11 | 
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # Release Notes
 2 | 
 3 | ## 1.1.2
 4 | 
 5 | - Fix some FileHandler interactions with S3
 6 | - Code linting improvements
 7 | 
 8 | ## 1.1.1
 9 | 
10 | - Release repo history
11 | 
12 | ## 1.1.0
13 | 
14 | - Replace FileHandler's dependency on boto2 with boto3, which is required for role-based authentication with AWS services
15 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | Unidecode>=1.1.1
 2 | beautifulsoup4>=4.10.0
 3 | boto3>=1.16.0
 4 | chardet>=4.0.0
 5 | fake_useragent>=0.1.11
 6 | lxml>=4.4.2
 7 | nilsimsa>=0.3.8
 8 | numpy>=1.18.1
 9 | pandas>=0.25.3
10 | requests>=2.25.1
11 | scandir>=1.10.0
12 | six>=1.16.0
13 | ssdeep>=3.4
14 | stopit>=1.1.2
15 | tldextract>=2.2.2
16 | zipcodes>=1.1.0
17 | 
--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
 1 | # .coveragerc to control coverage.py
 2 | [run]
 3 | branch = False
 4 | cover_pylib = False
 5 | source = pewtils
 6 | omit =
 7 |     */site-packages/*
 8 |     pewtils/internal/*
 9 | 
10 | [report]
11 | ignore_errors = True
12 | exclude_lines =
13 |     pragma: no cover
14 |     def __repr__
15 |     except
16 | omit =
17 |     */site-packages/*
18 |     pewtils/internal/*
--------------------------------------------------------------------------------
/docs_source/_static/theme_overrides.css:
--------------------------------------------------------------------------------
 1 | /* override table width restrictions */
 2 | @media screen and (min-width: 767px) {
 3 | 
 4 |     .wy-table-responsive table td {
 5 |         /* !important prevents the common CSS stylesheets from overriding
 6 |            this as on RTD they are loaded after this stylesheet */
 7 |         white-space: normal !important;
 8 |     }
 9 | 
10 |     .wy-table-responsive {
11 |         overflow: visible !important;
12 |     }
13 | }
--------------------------------------------------------------------------------
/.bumpversion.cfg:
--------------------------------------------------------------------------------
 1 | 
[bumpversion]
 2 | current_version = 1.1.6.dev1
 3 | commit = False
 4 | tag = False
 5 | parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>\w+)(?P<build>\d+))?
 6 | serialize =
 7 | 	{major}.{minor}.{patch}.{release}{build}
 8 | 	{major}.{minor}.{patch}
 9 | 
10 | [bumpversion:part:release]
11 | first_value = dev
12 | optional_value = prod
13 | values =
14 | 	dev
15 | 	prod
16 | 
17 | [bumpversion:part:build]
18 | 
19 | [bumpversion:file:setup.py]
20 | 
21 | [bumpversion:file:docs_source/conf.py]
22 | 
--------------------------------------------------------------------------------
/docs_source/io.rst:
--------------------------------------------------------------------------------
 1 | **************
 2 | I/O Tools
 3 | **************
 4 | 
 5 | This module contains utilities related to reading and writing files in a variety of formats. \
 6 | Right now, it consists exclusively of the :py:class:`pewtils.io.FileHandler` class, which provides \
 7 | a standardized interface for loading and saving data both locally and on Amazon S3. It doesn't \
 8 | always work exactly as intended, but 99% of the time, it gives us a way to read and write files \
 9 | with just one or two lines of code - and accordingly, we use it everywhere. We hope you do too!
10 | 
11 | .. automodule :: pewtils.io
12 |    :autosummary:
13 |    :members:
14 | 
--------------------------------------------------------------------------------
/pewtils/general_link_shorteners.csv:
--------------------------------------------------------------------------------
 1 | shortener
 2 | abre.ai
 3 | adf.ly
 4 | bit.do
 5 | bit.ly
 6 | bitly.com
 7 | buff.ly
 8 | crwd.fr
 9 | cutt.ly
10 | disq.us
11 | dlvr.it
12 | every.tw
13 | flip.it
14 | fus.in
15 | fw.to
16 | fwdaga.in
17 | goo.gl
18 | ht.ly
19 | hub.am
20 | hubs.ly
21 | is.gd
22 | j.mp
23 | lnks.gd
24 | loom.ly
25 | lsh.re
26 | more.pr
27 | msgp.pl
28 | mvnt.us
29 | ora.cl
30 | ow.ly
31 | owl.li
32 | pgj.cc
33 | po.st
34 | qoo.ly
35 | rb.gy
36 | scr.bi
37 | shar.es
38 | shoo.ly
39 | shout.lt
40 | shr.gs
41 | snip.ly
42 | snp.tv
43 | spr.ly
44 | su.pr
45 | t.co
46 | tiny.cc
47 | tinyurl.com
48 | trib.al
49 | trib.it
50 | urlm.in
51 | v.ht
52 | wp.me
--------------------------------------------------------------------------------
/tests/files/example_stripped_simple.html:
--------------------------------------------------------------------------------
 1 | Example Domain body { background-color: #f0f0f2; margin: 0; padding: 0; font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif; } div { width: 600px; margin: 5em auto; padding: 50px; background-color: #fff; border-radius: 1em; } a:link, a:visited { color: #38488f; text-decoration: none; } @media (max-width: 700px) { body { background-color: #fff; } div { width: auto; margin: 0 auto; border-radius: 0; padding: 1em; } }
 2 | Example Domain
 3 | This domain is established to be used for illustrative examples in documents. You may use this domain in examples without prior coordination or asking for permission.
 4 | 
 5 | More information...
 6 | 
 7 | 
--------------------------------------------------------------------------------
/.policy.yml:
--------------------------------------------------------------------------------
 1 | policy:
 2 |   approval:
 3 |     - or:
 4 |       - deploy updates
 5 |       - submodule updates
 6 |       - anointed maintainers say yes
 7 | 
 8 | approval_rules:
 9 |   - name: anointed maintainers say yes
10 |     options:
11 |       allow_contributor: true
12 |       invalidate_on_push: true
13 |     requires:
14 |       count: 1
15 |       teams:
16 |         - "pewresearch/pewtils-maintainers"
17 |       write_collaborators: true
18 | 
19 |   - name: deploy updates
20 |     options:
21 |       invalidate_on_push: true
22 |     if:
23 |       only_changed_files:
24 |         paths:
25 |           - '^deploy/.*'
26 | 
27 |   - name: submodule updates
28 |     options:
29 |       invalidate_on_push: true
30 |     if:
31 |       only_changed_files:
32 |         paths:
33 |           - '^src/.*'
34 | 
--------------------------------------------------------------------------------
/.github/workflows/build-docs.yaml:
--------------------------------------------------------------------------------
 1 | on:
 2 |   push:
 3 |     branches:
 4 |       - main
 5 |     paths:
 6 |       - .github/workflows/build-docs.yaml
 7 |       - Makefile
 8 |       - docs_source/**
 9 | 
10 | name: build-docs
11 | 
12 | jobs:
13 |   build-docs:
14 |     runs-on: pewtils-runner
15 |     name: build docs
16 |     permissions:
17 |       contents: write
18 |     env:
19 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
20 | 
21 |     steps:
22 |       - uses: actions/checkout@v2
23 | 
24 |       - name: Install python dependencies
25 |         run: |
26 |           while read requirement; do mamba install "conda-forge::$requirement" || true; done < requirements.txt
27 |           pip install -r requirements.txt
28 | 
29 |       - name: Build docs html
30 |         run: |
31 |           if [[ "${{ github.repository }}" == "pewresearch/pewtils" ]]; then
32 |             make github_docs
33 |           else
34 |             make s3_docs
35 |           fi
36 | 
--------------------------------------------------------------------------------
/docs_source/http.rst:
--------------------------------------------------------------------------------
 1 | **************
 2 | HTTP Utilities
 3 | **************
 4 | 
 5 | In this module, you'll find a variety of useful functions for working with web data. \
 6 | The :py:func:`pewtils.http.canonical_link` function is our best attempt at standardizing and cleaning a URL without \
 7 | losing any information, and the :py:func:`pewtils.http.strip_html` function is useful for attempting to extract text \
 8 | from raw HTML data with minimal fine-tuning.
 9 | 
10 | .. automodule :: pewtils.http
11 |    :autosummary:
12 |    :members:
13 | 
14 | +++++++++++++++
15 | Link Shorteners
16 | +++++++++++++++
17 | 
18 | List of link shorteners recognized by methods in this section.
19 | 
20 | General Link Shorteners
21 | ^^^^^^^^^^^^^^^^^^^^^^^
22 | 
23 | A list of known :ref:`gen_link_shorteners`.
24 | 
25 | Vanity Link Shorteners
26 | ^^^^^^^^^^^^^^^^^^^^^^^
27 | 
28 | A list of known :ref:`vanity_link_shorteners` used by specific websites (primarily news outlets).
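
For a quick sense of how the functions that consult these lists behave, here is a minimal sketch of domain extraction (the values shown mirror the behavior described in the function's documentation):

.. code-block:: python

    from pewtils.http import extract_domain_from_url

    >>> extract_domain_from_url("http://forums.bbc.co.uk", include_subdomain=False)
    'bbc.co.uk'
    >>> extract_domain_from_url("http://forums.bbc.co.uk", include_subdomain=True)
    'forums.bbc.co.uk'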
29 | 
--------------------------------------------------------------------------------
/.github/workflows/unit-tests.yaml:
--------------------------------------------------------------------------------
 1 | name: unit-tests
 2 | 
 3 | on:
 4 |   pull_request:
 5 |     branches:
 6 |       - main
 7 |     paths:
 8 |       - .github/workflows/unit-tests.yaml
 9 |       - Makefile
10 |       - pewtils/**
11 |       - requirements.txt
12 |       - setup.py
13 |       - tests/**
14 | 
15 | jobs:
16 |   unit-tests:
17 |     name: unit-tests
18 |     runs-on: pewtils-runner
19 |     env:
20 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
21 |     permissions:
22 |       contents: read
23 |       pull-requests: write
24 | 
25 |     steps:
26 |       - uses: actions/checkout@v2
27 | 
28 |       - name: Install python dependencies
29 |         run: |
30 |           while read requirement; do mamba install "conda-forge::$requirement" || true; done < requirements.txt
31 |           pip install -r requirements.txt
32 | 
33 |       - name: Lint with flake8
34 |         env:
35 |           REVIEWDOG_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
36 |         run: |
37 |           make python_lint_errors
38 |           make github_lint_flake8
39 | 
40 |       - name: Run unit tests
41 |         run: |
42 |           make python_test
43 | 
--------------------------------------------------------------------------------
/docs_source/http_link_shorteners.rst:
--------------------------------------------------------------------------------
 1 | .. _link_shorteners:
 2 | 
 3 | ***************
 4 | Link Shorteners
 5 | ***************
 6 | 
 7 | Lists of known link shorteners recognized by ``pewtils.http`` utility methods such \
 8 | as :py:func:`pewtils.http.canonical_link` and :py:func:`pewtils.http.extract_domain_from_url`. These lists were \
 9 | compiled from several collections of shortened links found in social media posts and news articles, so most of the \
10 | shorteners belong to news outlets and large popular websites, especially those with political content. Since domains \
11 | can be retired or may change ownership and get redirected to different websites over time, these lists may not \
12 | be perfectly accurate. We will try to keep them updated as we become aware of changes, but if you notice any \
13 | inaccuracies or wish to add to these lists, please consider making a pull request!
14 | 
15 | .. _gen_link_shorteners:
16 | .. csv-table:: Generic Link Shorteners
17 |    :file: ../pewtils/general_link_shorteners.csv
18 |    :widths: 30
19 |    :header-rows: 1
20 | 
21 | .. _vanity_link_shorteners:
22 | .. csv-table:: Vanity Link Shorteners
23 |    :file: ../pewtils/vanity_link_shorteners.csv
24 |    :widths: 30, 30, 30
25 |    :header-rows: 1
26 | 
--------------------------------------------------------------------------------
/docs_source/pewtils_core.rst:
--------------------------------------------------------------------------------
 1 | **************
 2 | Core Functions
 3 | **************
 4 | 
 5 | The main Pewtils module contains a variety of generally useful functions that make our researchers' \
 6 | lives easier. For those still working in Python 2.x, the :py:func:`pewtils.decode_text` function can help alleviate \
 7 | headaches related to text encodings. The :py:func:`pewtils.is_null` and :py:func:`pewtils.is_not_null` functions \
 8 | provide an easy way to deal with the wide variety of possible null values that exist in Python (and the broader \
 9 | research universe) by using a best-guess approach. When working with dictionaries or JSON records that need \
10 | to be updated, :py:func:`pewtils.recursive_update` makes it easy to map one version of an object onto another.
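
A minimal sketch of the null-checking and dictionary-merging cases (the expected values mirror the behavior shown on the Examples page):

.. code-block:: python

    from pewtils import is_not_null, recursive_update

    >>> is_not_null("N/A")  # common string stand-ins for null are caught
    False
    >>> recursive_update({"a": {"b": 1}}, {"a": {"c": 2}})
    {'a': {'b': 1, 'c': 2}}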
While \
11 | we strive to write efficient code that can cover every possible use-case, there are certainly some \
12 | edge cases that we haven't encountered, and other existing Python libraries may very well provide \
13 | many of these same features. This collection simply consists of functions we find ourselves using \
14 | again and again, and we hope that Pewtils may help expand your daily toolkit in some way as well.
15 | 
16 | .. automodule :: pewtils.__init__
17 |    :autosummary:
18 |    :members:
19 | 
--------------------------------------------------------------------------------
/pewtils/regex.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | 
 4 | URL_REGEX = re.compile(
 5 |     r"((?:https?:\/\/(?:www\.)?)?[-a-zA-Z0-9@:%._\+~#=]{1,4096}\.[a-z]{2,6}\b(?:[-a-zA-Z0-9@:%_\+.~#?&//=]*))"
 6 | )
 7 | """
 8 | A compiled regular expression for extracting (probably) valid URLs.
 9 | """
10 | 
11 | DOMAIN_REGEX = re.compile(
12 |     r"(?:http[s]?\:\/\/)?(?:www(?:s?)\.)?([\w\.\-]+)(?:[\\\/](?:.+))?"
13 | )
14 | """
15 | A compiled regular expression for extracting domains from URLs. Can be useful in a pinch but we recommend \
16 | using the :py:func:`pewtils.http.extract_domain_from_url` function instead.
17 | """
18 | 
19 | HTTP_REGEX = re.compile(r"^http(?:s)?\:\/\/")
20 | """
21 | A compiled regular expression for finding HTTP/S prefixes.
22 | """
23 | 
24 | US_DOLLAR_REGEX = re.compile(
25 |     r"(\$(?:[1-9][0-9]{0,2}(?:(?:\,[0-9]{3})+)?(?:\.[0-9]{1,2})?))\b"
26 | )
27 | """
28 | A compiled regular expression for finding USD monetary amounts.
29 | """
30 | 
31 | TITLEWORD_REGEX = re.compile(r"\b([A-Z][a-z]+)\b")
32 | """
33 | A compiled regular expression for finding basic title-cased words.
34 | """
35 | 
36 | NUMBER_REGEX = re.compile(r"\b([0-9]+)\b")
37 | """
38 | A compiled regular expression for finding raw numbers.
39 | """
40 | 
41 | NONALPHA_REGEX = re.compile(r"[^\w]")
42 | """
43 | A compiled regular expression for finding non-word characters (anything other than letters, digits, and underscores).
44 | """
45 | 
--------------------------------------------------------------------------------
/tests/files/example.html:
--------------------------------------------------------------------------------
 1 | <!doctype html>
 2 | <html>
 3 | <head>
 4 | 
 5 |     <title>Example Domain</title>
 6 | 
 7 |     <meta charset="utf-8" />
 8 |     <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
 9 |     <meta name="viewport" content="width=device-width, initial-scale=1" />
10 |     <style type="text/css">
11 |     body {
12 |         background-color: #f0f0f2;
13 |         margin: 0;
14 |         padding: 0;
15 |         font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
16 | 
17 |     }
18 |     div {
19 |         width: 600px;
20 |         margin: 5em auto;
21 |         padding: 50px;
22 |         background-color: #fff;
23 |         border-radius: 1em;
24 |     }
25 |     a:link, a:visited {
26 |         color: #38488f;
27 |         text-decoration: none;
28 |     }
29 |     @media (max-width: 700px) {
30 |         body {
31 |             background-color: #fff;
32 |         }
33 |         div {
34 |             width: auto;
35 |             margin: 0 auto;
36 |             border-radius: 0;
37 |             padding: 1em;
38 |         }
39 |     }
40 | 
41 |     </style>
42 | </head>
43 | <body>
44 | <div>
45 |     <h1>Example Domain</h1>
46 |     <p>This domain is established to be used for illustrative examples in documents. You may use this
47 |     domain in examples without prior coordination or asking for permission.</p>
48 |     <p><a href="https://www.iana.org/domains/example">More information...</a></p>
49 | </div>
50 | </body>
51 | </html>
52 | 
--------------------------------------------------------------------------------
/.github/workflows/build-release.yaml:
--------------------------------------------------------------------------------
 1 | on:
 2 |   push:
 3 |     tags:
 4 |       - 'v*.*.*'
 5 | 
 6 | name: build-release
 7 | 
 8 | jobs:
 9 |   build-release:
10 |     runs-on: pewtils-runner
11 |     name: build release package
12 |     env:
13 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
14 |     permissions:
15 |       contents: write
16 | 
17 |     steps:
18 |       - uses: actions/checkout@v2
19 | 
20 |       - name: Install python dependencies
21 |         run: |
22 |           while read requirement; do mamba install "conda-forge::$requirement" || true; done < requirements.txt
23 |           pip install -r requirements.txt
24 | 
25 |       - name: Lint with flake8
26 |         run: |
27 |           make python_lint_errors
28 |           make python_lint_quality
29 | 
30 |       - name: Run unit tests
31 |         run: |
32 |           make python_test
33 | 
34 |       - name: Build Python package
35 |         run: |
36 |           make python_build
37 | 
38 |       - name: Upload to Nexus Repository
39 |         run: |
40 |           twine upload --non-interactive --repository-url '${{ secrets.PACKAGE_REPO_URL_PYTHON }}' --username '${{ secrets.PACKAGE_REPO_USER }}' --password '${{ secrets.PACKAGE_REPO_PASSWORD }}' dist/*
41 | 
42 |       - name: Publish Release
43 |         uses: softprops/action-gh-release@v1
44 |         env:
45 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
46 |         with:
47 |           prerelease: False
48 |           body_path: CHANGELOG.md
49 |           files: |
50 |             *.whl
51 | 
--------------------------------------------------------------------------------
/.github/workflows/build-main.yaml:
--------------------------------------------------------------------------------
 1 | on:
 2 |   push:
 3 |     branches:
 4 |       - main
 5 |     paths:
 6 |       - .github/workflows/build-main.yaml
 7 |       - Makefile
 8 |       - pewtils/**
 9 |       - requirements.txt
10 |       - setup.py
11 |       - tests/**
12 | 
13 | name: build-main
14 | 
15 | jobs:
16 |   build-main:
17 |     runs-on: pewtils-runner
18 |     name: build main branch package
19 |     env:
20 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
21 |     permissions:
22 |       contents: write
23 | 
24 |     steps:
25 |       - uses: actions/checkout@v2
26 | 
27 |       - name: Bump the build version
28 |         run: |
29 |           git config --global user.name "Github Actions"
30 |           git config --global user.email "<>"
31 |           make bump
32 | 
33 |       - name: Install python dependencies
34 |         run: |
35 |           while read requirement; do mamba install "conda-forge::$requirement" || true; done < requirements.txt
36 |           pip install -r requirements.txt
37 | 
38 |       - name: Lint with flake8
39 |         run: |
40 |           make python_lint_errors
41 |           make python_lint_quality
42 | 
43 |       - name: Run unit tests
44 |         run: |
45 |           make python_test
46 | 
47 |       - name: Build Python package
48 |         run: |
49 |           make python_build
50 | 
51 |       - name: Upload to Package Repository
52 |         run: |
53 |           twine upload --non-interactive --repository-url '${{ secrets.PACKAGE_REPO_URL_PYTHON }}' --username '${{ secrets.PACKAGE_REPO_USER }}' --password '${{ secrets.PACKAGE_REPO_PASSWORD }}' dist/*
54 | 
55 |       - name: Sync new build commits
56 |         run: |
57 |           make sync_branch
58 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | with open("README.md") as README:
 4 |     readme = str(README.read())
 5 | 
 6 | with open("requirements.txt") as reqs:
 7 |     lines = reqs.read().split("\n")
 8 |     install_requires = [line for line in lines if line]
 9 | 
10 | setup(
11 |     name="pewtils",
12 |     version="1.1.6.dev1",
13 |     
description="General programming utilities from Pew Research Center", 14 | long_description=readme, 15 | long_description_content_type="text/markdown", 16 | url="https://github.com/pewresearch/pewtils", 17 | author="Pew Research Center", 18 | author_email="info@pewresearch.org", 19 | install_requires=install_requires, 20 | packages=find_packages(exclude=["contrib", "docs", "tests"]), 21 | include_package_data=True, 22 | keywords="utilities, link standardization, input, output", 23 | license="GPLv2+", 24 | classifiers=[ 25 | # https://pypi.python.org/pypi?%3Aaction=list_classifiers 26 | "Development Status :: 5 - Production/Stable", 27 | # "Development Status :: 6 - Mature", 28 | # "Development Status :: 7 - Inactive", 29 | "Environment :: Console", 30 | "Intended Audience :: Developers", 31 | "Intended Audience :: Education", 32 | "Intended Audience :: Information Technology", 33 | "Intended Audience :: Science/Research", 34 | "License :: OSI Approved :: GNU General Public License v2 or later (GPLv2+)", 35 | "Operating System :: OS Independent", 36 | "Programming Language :: Python :: 3.7", 37 | "Programming Language :: Python :: 3.8", 38 | "Programming Language :: Python :: 3.9", 39 | "Programming Language :: Python", 40 | "Topic :: Software Development :: Libraries :: Python Modules", 41 | "Topic :: Utilities", 42 | ], 43 | ) 44 | -------------------------------------------------------------------------------- /.github/runner.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: actions.summerwind.dev/v1alpha1 2 | kind: RunnerDeployment 3 | metadata: 4 | name: pewtils-dev-runner 5 | namespace: github-runners 6 | 7 | spec: 8 | replicas: 1 9 | template: 10 | spec: 11 | repository: pewresearch/pewtils_dev 12 | image: 458280294434.dkr.ecr.us-east-1.amazonaws.com/labs-actions-runner@sha256:41a92e6db53febef2db892cea45680d480dce6c8f576367b1245d57f017e7935 13 | imagePullPolicy: Always 14 | serviceAccountName: labs-runner 15 | labels: 16 | - pewtils-runner 17 | dockerEnabled: false 18 | dockerdContainerResources: 19 | limits: 20 | cpu: "4.0" 21 | memory: "8Gi" 22 | 23 | requests: 24 | cpu: "100m" 25 | memory: "2Gi" 26 | 27 | env: 28 | - name: AWS_ACCESS_KEY_ID 29 | valueFrom: 30 | secretKeyRef: 31 | name: labs-runner 32 | key: AWS_ACCESS_KEY_ID 33 | 34 | - name: AWS_SECRET_ACCESS_KEY 35 | valueFrom: 36 | secretKeyRef: 37 | name: labs-runner 38 | key: AWS_SECRET_ACCESS_KEY 39 | 40 | metadata: 41 | annotations: 42 | cluster-autoscaler.kubernetes.io/safe-to-evict: "true" 43 | 44 | --- 45 | 46 | apiVersion: actions.summerwind.dev/v1alpha1 47 | kind: RunnerDeployment 48 | metadata: 49 | name: pewtils-runner 50 | namespace: github-runners 51 | 52 | spec: 53 | replicas: 1 54 | template: 55 | spec: 56 | repository: pewresearch/pewtils 57 | image: 458280294434.dkr.ecr.us-east-1.amazonaws.com/labs-actions-runner@sha256:41a92e6db53febef2db892cea45680d480dce6c8f576367b1245d57f017e7935 58 | imagePullPolicy: Always 59 | serviceAccountName: labs-runner 60 | labels: 61 | - pewtils-runner 62 | dockerEnabled: false 63 | dockerdContainerResources: 64 | limits: 65 | cpu: "4.0" 66 | memory: "8Gi" 67 | 68 | requests: 69 | cpu: "100m" 70 | memory: "2Gi" 71 | 72 | metadata: 73 | annotations: 74 | cluster-autoscaler.kubernetes.io/safe-to-evict: "true" 75 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | BRANCH := $(shell git rev-parse 
--symbolic-full-name --abbrev-ref HEAD) 2 | 3 | # by default, we'll bump the "build" part of the version, for non-releases 4 | PART = build 5 | 6 | # if the current version is a release and not a dev build, bump the patch part instead 7 | VERSION := $(shell grep -Po '(?<=current_version = )[\w\d\.]+' .bumpversion.cfg) 8 | ifeq (,$(findstring dev,$(VERSION))) 9 | ifeq ($(PART),build) 10 | PART = patch 11 | endif 12 | endif 13 | 14 | # Minimal makefile for Sphinx documentation 15 | 16 | SPHINXOPTS = 17 | SPHINXBUILD = sphinx-build 18 | SOURCEDIR = docs_source 19 | BUILDDIR = _build 20 | 21 | # Put it first so that "make" without argument is like "make help". 22 | help: 23 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 24 | 25 | .PHONY: help Makefile 26 | 27 | docs: 28 | -rm -rf _build/ 29 | make html 30 | 31 | s3_docs: docs 32 | aws s3 sync --delete _build/html/ s3://docs.pewresearch.tech/pewtils/ 33 | 34 | github_docs: 35 | make html 36 | -mv _build/html /tmp/html 37 | -rm -rf _build 38 | -git branch -D docs 39 | git fetch --all 40 | git checkout docs 41 | -mv .git /tmp/.git 42 | -rm -rf * .* 43 | -mv /tmp/.git . 44 | cp -a /tmp/html/. . 45 | -rm -rf /tmp/html 46 | git add -A . 47 | git commit -m "latest docs" 48 | git push origin docs 49 | git checkout $(BRANCH) 50 | 51 | python_lint_errors: 52 | # stop the build if there are Python syntax errors or undefined names 53 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics --exclude=.git,__pycache__,build,dist 54 | 55 | python_lint_quality: 56 | flake8 . --exit-zero --statistics --count --show-source --max-line-length=127 --ignore=E201,E202,E501,E722,W503,W504 --exclude=.git,__pycache__,build,dist 57 | 58 | github_lint_flake8: 59 | flake8 . --max-line-length 127 --ignore=E201,E202,E501,E722,W503,W504 --exclude=.git,__pycache__,build,dist | reviewdog -reporter=github-pr-check -f=flake8 60 | 61 | python_test: 62 | python3 -m unittest tests 63 | 64 | python_build: 65 | python3 setup.py sdist bdist_wheel 66 | 67 | .ONESHELL: 68 | bump: 69 | git checkout $(BRANCH) 70 | git pull origin $(BRANCH) 71 | bumpversion --commit $(PART) 72 | 73 | .ONESHELL: 74 | sync_branch: 75 | git checkout $(BRANCH) 76 | git pull origin $(BRANCH) 77 | git push origin $(BRANCH) 78 | 79 | .ONESHELL: 80 | release: 81 | git checkout $(BRANCH) 82 | git pull origin $(BRANCH) 83 | bumpversion --commit $(PART) 84 | bumpversion --commit --tag release 85 | git push origin $(BRANCH) --follow-tags 86 | 87 | # Catch-all target: route all unknown targets to Sphinx using the new 88 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 89 | %: Makefile 90 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 91 | -------------------------------------------------------------------------------- /docs_source/index.rst: -------------------------------------------------------------------------------- 1 | Pewtils 2 | =================================================================== 3 | 4 | Pewtils is a package of useful programming utilities developed at the Pew Research Center \ 5 | over the years. Most of the functions in Pewtils can be found in the root module, while a \ 6 | handful of submodules contain more specialized utilities for working with files, web \ 7 | resources, and regular expressions. 8 | 9 | .. 
toctree::
10 |    :maxdepth: 1
11 |    :caption: Table of Contents:
12 | 
13 |    Core Functions <pewtils_core>
14 |    HTTP Utilities <http>
15 |    I/O Tools <io>
16 |    Regex Patterns <regex>
17 |    Examples <examples>
18 | 
19 | Installation
20 | ---------------
21 | 
22 | To install, you can use ``pip``:
23 | 
24 | .. code-block:: bash
25 | 
26 |     pip install git+https://github.com/pewresearch/pewtils#egg=pewtils
27 | 
28 | Or you can install from source:
29 | 
30 | .. code-block:: bash
31 | 
32 |     git clone https://github.com/pewresearch/pewtils.git
33 |     cd pewtils
34 |     python setup.py install
35 | 
36 | .. note::
37 |     This is a Python 3 package. Though it is compatible with Python 2, many of its dependencies are \
38 |     planning to drop support for earlier versions if they haven't already. We highly recommend \
39 |     you upgrade to Python 3.
40 | 
41 | Installation Troubleshooting
42 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
43 | 
44 | Using 64-bit Python
45 | """"""""""""""""""""
46 | 
47 | Some of our libraries require the use of 64-bit Python. If you encounter errors during installation \
48 | that are related to missing libraries, you may be using 32-bit Python. We recommend that you uninstall \
49 | this version and switch to a 64-bit version instead. On Windows, these will be marked with ``x86-64``; you \
50 | can find the latest 64-bit versions of Python `here <https://www.python.org/downloads>`_.
51 | 
52 | Installing ssdeep
53 | """"""""""""""""""""""""""""
54 | 
55 | ssdeep is an optional dependency that can be used by the :py:func:`pewtils.get_hash` function in Pewtils. \
56 | Installation instructions for various Linux distributions can be found in the library's \
57 | `documentation <https://python-ssdeep.readthedocs.io/en/latest/installation.html>`_. The ssdeep \
58 | Python library is not currently compatible with Windows. \
59 | Installing ssdeep on Mac OS may involve a few additional steps, detailed below:
60 | 
61 | 1. Install Homebrew
62 | 
63 | 2. Install Xcode
64 | 
65 | .. code-block:: bash
66 | 
67 |     xcode-select --install
68 | 
69 | 3. Install system dependencies
70 | 
71 | .. code-block:: bash
72 | 
73 |     brew install pkg-config libffi libtool automake
74 |     ln -s /usr/local/bin/glibtoolize /usr/local/bin/libtoolize
75 | 
76 | 4. Install ssdeep with an additional flag to build the required libraries
77 | 
78 | .. code-block:: bash
79 | 
80 |     BUILD_LIB=1 pip install ssdeep
81 | 
82 | 5. If step 4 fails, you may need to redirect your system to the new libraries by setting the following flags:
83 | 
84 | .. code-block:: bash
85 | 
86 |     export LIBTOOL=`which glibtool`
87 |     export LIBTOOLIZE=`which glibtoolize`
88 | 
89 | Do this and try step 4 again.
90 | 
91 | 6. Now you should be able to run the main installation process detailed above.
92 | 
93 | 
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing to Pewtils
 2 | 
 3 | 
 4 | 
 5 | [repo]: https://github.com/pewresearch/pewtils
 6 | [issues]: https://github.com/pewresearch/pewtils/issues
 7 | [new_issue]: https://github.com/pewresearch/pewtils/issues/new
 8 | [email]: info@pewresearch.org
 9 | 
10 | ## How you can contribute
11 | 
12 | There are several ways you can contribute to this project. If you want to know more about why and how to contribute to open source projects like this one, see this [Open Source Guide](https://opensource.guide/how-to-contribute/).
13 | 
14 | ### Share the love ❤️
15 | 
16 | Think **pewtils** is useful? Let others discover it by telling them in person, via Twitter, or in a blog post.
17 | 
18 | ### Ask a question ⁉️
19 | 
20 | Using **pewtils** and got stuck?
Check out the [documentation](https://pewresearch.github.io/pewtils/).
21 | Still stuck? Post your question as an [issue on GitHub][new_issue]. While we cannot offer user support, we'll do our best to address it, as questions often lead to better documentation or the discovery of bugs.
22 | 
23 | Want to ask a question in private? Contact the package maintainer by [email][email].
24 | 
25 | ### Propose an idea 💡
26 | 
27 | Have an idea for a new **pewtils** feature? Take a look at the [issue list][issues] to see if it has already been suggested. If not, suggest your idea as an [issue on GitHub][new_issue]. While we can't promise to implement your idea, it helps to:
28 | 
29 | * Explain in detail how it would work.
30 | * Keep the scope as narrow as possible.
31 | 
32 | See below if you want to contribute code for your idea as well.
33 | 
34 | ### Report a bug 🐛
35 | 
36 | Using **pewtils** and discovered a bug? That's annoying! Don't let others have the same experience and report it as an [issue on GitHub][new_issue] so we can fix it. A good bug report makes it easier for us to do so, so please include:
37 | 
38 | * Your operating system name and version (e.g. macOS 10.13.6).
39 | * Any details about your local setup that might be helpful in troubleshooting.
40 | * Detailed steps to reproduce the bug.
41 | 
42 | ### Contribute code 📝
43 | 
44 | Care to fix bugs or implement new functionality for **pewtils**? Awesome! 👏 Have a look at the [issue list][issues] and leave a comment on the things you want to work on. When making contributions, please follow the development guidelines below.
45 | 
46 | #### Development guidelines
47 | 
48 | We try to follow the [GitHub flow](https://guides.github.com/introduction/flow/) for development, and we use Python docstrings and [Sphinx](https://www.sphinx-doc.org/en/master/) to document all of our code.
49 | 
50 | 1. Fork [this repo][repo] and clone it to your computer. To learn more about this process, see [this guide](https://guides.github.com/activities/forking/).
51 | 2. If you have forked and cloned the project before and it has been a while since you worked on it, [pull changes from the original repo](https://help.github.com/articles/merging-an-upstream-repository-into-your-fork/) to your clone by using `git pull upstream main`.
52 | 3. Make your changes:
53 |     * Write your code.
54 |     * Test your code (bonus points for adding unit tests).
55 |     * Document your code (see function documentation above).
56 | 4. If you added unit tests, make sure everything works by running the `python -m unittest tests` command from the root directory of the repository.
57 | 5. If you added or updated documentation, build a fresh version of the docs by running the `make github_docs` command from the root directory of the repository.
58 | 6. Commit and push your changes.
59 | 7. Submit a [pull request](https://guides.github.com/activities/forking/#making-a-pull-request).
-------------------------------------------------------------------------------- /tests/regex.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | 4 | class RegexTests(unittest.TestCase): 5 | 6 | """ 7 | To test, navigate to pewtils root folder and run `python -m unittest tests` 8 | """ 9 | 10 | def setUp(self): 11 | pass 12 | 13 | def test_url_regex(self): 14 | 15 | from pewtils.regex import URL_REGEX 16 | 17 | for val in [ 18 | "example.com", 19 | "www.example.com", 20 | "http://example.com", 21 | "https://example.com", 22 | "https://www.example.com", 23 | "example.com/test", 24 | "example.com/test?test=test", 25 | "http://example.com?test=test&test=test", 26 | "https://t.co/example", 27 | ]: 28 | result = URL_REGEX.findall("test {} test".format(val)) 29 | self.assertEqual(result[0], val) 30 | 31 | def test_domain_regex(self): 32 | from pewtils.regex import DOMAIN_REGEX 33 | 34 | for val in ["example.com", "http://example.com"]: 35 | result = DOMAIN_REGEX.findall(val) 36 | self.assertEqual(result[0], "example.com") 37 | for val in [ 38 | "test.example.com", 39 | "http://test.example.com", 40 | "https://www.test.example.com", 41 | "test.example.com/test", 42 | ]: 43 | result = DOMAIN_REGEX.findall(val) 44 | self.assertEqual(result[0], "test.example.com") 45 | 46 | def test_http_regex(self): 47 | 48 | from pewtils.regex import HTTP_REGEX 49 | 50 | for val in [ 51 | "http://example.com", 52 | "https://example.com", 53 | "https://www.example.com", 54 | "http://example.com?test=test&test=test", 55 | ]: 56 | result = HTTP_REGEX.match(val) 57 | self.assertIsNotNone(result) 58 | 59 | for val in [ 60 | "example.com", 61 | "www.example.com", 62 | "example.com/test", 63 | "example.com/test?test=test", 64 | ]: 65 | result = HTTP_REGEX.match(val) 66 | self.assertIsNone(result) 67 | 68 | def test_us_dollar_regex(self): 69 | from pewtils.regex import US_DOLLAR_REGEX 70 | 71 | for val in [ 72 | "$1.00", 73 | "$10", 74 | "$10,000", 75 | "$999,999", 76 | "$1,000,000,000", 77 | "$1,000,000,000.00", 78 | ]: 79 | result = US_DOLLAR_REGEX.findall(val) 80 | self.assertEqual(result[0], val) 81 | 82 | for val in ["$01,000", "$01", "$1a0,000", "$.00", "$01.00"]: 83 | result = US_DOLLAR_REGEX.findall(val) 84 | self.assertEqual(len(result), 0) 85 | 86 | def test_titleword_regex(self): 87 | from pewtils.regex import TITLEWORD_REGEX 88 | 89 | for val, expected in [ 90 | ("this is a Test", ["Test"]), 91 | ("testing One two three", ["One"]), 92 | ("testing One Two Three", ["One", "Two", "Three"]), 93 | ("testing One1 Two2 Three3", []), 94 | ("testing one two three", []), 95 | ]: 96 | result = TITLEWORD_REGEX.findall(val) 97 | self.assertEqual(result, expected) 98 | 99 | def test_number_regex(self): 100 | from pewtils.regex import NUMBER_REGEX 101 | 102 | for val, expected in [ 103 | ("one 2 three", ["2"]), 104 | ("1234", ["1234"]), 105 | (" 12 345 ", ["12", "345"]), 106 | ("one2three", []), 107 | ]: 108 | result = NUMBER_REGEX.findall(val) 109 | self.assertEqual(result, expected) 110 | 111 | def test_nonalpha_regex(self): 112 | from pewtils.regex import NONALPHA_REGEX 113 | 114 | for val, expected in [ 115 | ("abc$efg", ["$"]), 116 | ("one ^%& two", [" ", "^", "%", "&", " "]), 117 | ("one two three", [" ", " "]), 118 | ("1234", []), 119 | ]: 120 | result = NONALPHA_REGEX.findall(val) 121 | self.assertEqual(result, expected) 122 | -------------------------------------------------------------------------------- /README.md: 
--------------------------------------------------------------------------------
 1 | # pewtils
 2 | 
 3 | Pewtils is a package of useful programming utilities developed at the Pew Research Center over the years. Most of the functions in Pewtils can be found in the root module, while a handful of submodules contain more specialized utilities for working with files, web resources, and regular expressions.
 4 | 
 5 | ## Installation
 6 | 
 7 | To install, you can use `pip`:
 8 | 
 9 |     pip install git+https://github.com/pewresearch/pewtils#egg=pewtils
10 | 
11 | Or you can install from source:
12 | 
13 |     git clone https://github.com/pewresearch/pewtils.git
14 |     cd pewtils
15 |     python setup.py install
16 | 
17 | ### Installation Troubleshooting
18 | 
19 | #### Using 64-bit Python
20 | 
21 | Some of our libraries require the use of 64-bit Python. If you encounter errors during installation that are related to missing libraries, you may be using 32-bit Python. We recommend that you uninstall this version and switch to a 64-bit version instead. On Windows, these will be marked with `x86-64`; you can find the latest 64-bit versions of Python [here](https://www.python.org/downloads).
22 | 
23 | #### Installing ssdeep
24 | 
25 | ssdeep is an optional dependency that can be used by the `get_hash` function in Pewtils. Installation instructions for various Linux distributions can be found in the library's [documentation](https://python-ssdeep.readthedocs.io/en/latest/installation.html). The ssdeep Python library is not currently compatible with Windows. Installing ssdeep on Mac OS may involve a few additional steps, detailed below:
26 | 
27 | 1. Install Homebrew
28 | 2. Install Xcode
29 |     ```
30 |     xcode-select --install
31 |     ```
32 | 3. Install system dependencies
33 |     ```
34 |     brew install pkg-config libffi libtool automake
35 |     ln -s /usr/local/bin/glibtoolize /usr/local/bin/libtoolize
36 |     ```
37 | 4. Install ssdeep with an additional flag to build the required libraries
38 |     ```
39 |     BUILD_LIB=1 pip install ssdeep
40 |     ```
41 | 5. If step 4 fails, you may need to redirect your system to the new libraries by setting the following flags:
42 |     ```
43 |     export LIBTOOL=`which glibtool`
44 |     export LIBTOOLIZE=`which glibtoolize`
45 |     ```
46 |     Do this and try step 4 again.
47 | 6. Now you should be able to run the main installation process detailed above.
48 | 
49 | ## Documentation
50 | 
51 | Please refer to the [official documentation](https://pewresearch.github.io/pewtils/) for information on how to use this package.
52 | 
53 | ## Use Policy
54 | 
55 | In addition to the [license](https://github.com/pewresearch/pewtils/blob/master/LICENSE), Users must abide by the following conditions:
56 | 
57 | - User may not use the Center's logo.
58 | - User may not use the Center's name in any advertising, marketing or promotional materials.
59 | - User may not use the licensed materials in any manner that implies, suggests, or could otherwise be perceived as attributing a particular policy or lobbying objective or opinion to the Center, or as a Center endorsement of a cause, candidate, issue, party, product, business, organization, religion or viewpoint.
60 | 
61 | ## Issues and Pull Requests
62 | 
63 | This code is provided as-is for use in your own projects. You are free to submit issues and pull requests with any questions or suggestions you may have. We will do our best to respond within a 30-day time period.
64 | 
65 | ## Recommended Package Citation
66 | 
67 | Pew Research Center, 2020, "pewtils." Available at: github.com/pewresearch/pewtils
68 | 
69 | ## Acknowledgements
70 | 
71 | The following authors contributed to this repository:
72 | 
73 | - Patrick van Kessel
74 | - Regina Widjaya
75 | - Skye Toor
76 | - Emma Remy
77 | - Onyi Lam
78 | - Brian Broderick
79 | - Galen Stocking
80 | - Dennis Quinn
81 | 
82 | ## About Pew Research Center
83 | 
84 | Pew Research Center is a nonpartisan fact tank that informs the public about the issues, attitudes and trends shaping the world. It does not take policy positions. The Center conducts public opinion polling, demographic research, content analysis and other data-driven social science research. It studies U.S. politics and policy; journalism and media; internet, science and technology; religion and public life; Hispanic trends; global attitudes and trends; and U.S. social and demographic trends. All of the Center's reports are available at [www.pewresearch.org](http://www.pewresearch.org). Pew Research Center is a subsidiary of The Pew Charitable Trusts, its primary funder.
85 | 
86 | ## Contact
87 | 
88 | For all inquiries, please email info@pewresearch.org. Please be sure to specify your deadline, and we will get back to you as soon as possible. This email account is monitored regularly by Pew Research Center Communications staff.
89 | 
90 | 
--------------------------------------------------------------------------------
/docs_source/conf.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #
 3 | # Configuration file for the Sphinx documentation builder.
 4 | #
 5 | # This file does only contain a selection of the most common options. For a
 6 | # full list see the documentation:
 7 | # http://www.sphinx-doc.org/en/master/config
 8 | 
 9 | # -- Path setup --------------------------------------------------------------
10 | 
11 | # If extensions (or modules to document with autodoc) are in another directory,
12 | # add these directories to sys.path here. If the directory is relative to the
13 | # documentation root, use os.path.abspath to make it absolute, like shown here.
14 | #
15 | import os, sys
16 | 
17 | sys.path.insert(0, os.path.abspath(".."))
18 | 
19 | 
20 | # -- Project information -----------------------------------------------------
21 | 
22 | project = "pewtils"
23 | copyright = "2020, Pew Research Center"
24 | author = "Pew Research Center"
25 | 
26 | # The short X.Y version
27 | version = ""
28 | # The full version, including alpha/beta/rc tags
29 | release = "1.1.6.dev1"
30 | 
31 | 
32 | # -- General configuration ---------------------------------------------------
33 | 
34 | # If your documentation needs a minimal Sphinx version, state it here.
35 | #
36 | # needs_sphinx = '1.0'
37 | 
38 | # Add any Sphinx extension module names here, as strings. They can be
39 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
40 | # ones.
41 | extensions = [
42 |     "sphinx.ext.autodoc",
43 |     "sphinx.ext.intersphinx",
44 |     "sphinx.ext.coverage",
45 |     "sphinx.ext.mathjax",
46 |     "sphinx.ext.ifconfig",
47 |     "sphinx.ext.viewcode",
48 |     "sphinx.ext.githubpages",
49 |     "autodocsumm",
50 | ]
51 | 
52 | # Add any paths that contain templates here, relative to this directory.
53 | templates_path = ["_templates"]
54 | 
55 | # The suffix(es) of source filenames.
56 | # You can specify multiple suffixes as a list of strings:
57 | #
58 | # source_suffix = ['.rst', '.md']
59 | source_suffix = ".rst"
60 | 
61 | # The master toctree document.
62 | master_doc = "index"
63 | 
64 | # The language for content autogenerated by Sphinx. Refer to documentation
65 | # for a list of supported languages.
66 | #
67 | # This is also used if you do content translation via gettext catalogs.
68 | # Usually you set "language" from the command line for these cases.
69 | language = None
70 | 
71 | # List of patterns, relative to source directory, that match files and
72 | # directories to ignore when looking for source files.
73 | # This pattern also affects html_static_path and html_extra_path.
74 | exclude_patterns = []
75 | 
76 | # The name of the Pygments (syntax highlighting) style to use.
77 | pygments_style = None
78 | 
79 | # Prevent autodoc from sorting document members alphabetically
80 | autodoc_member_order = "bysource"
81 | 
82 | # If true, the current module name will be prepended to all description
83 | # unit titles (such as .. function::).
84 | add_module_names = False
85 | 
86 | # -- Options for HTML output -------------------------------------------------
87 | 
88 | # The theme to use for HTML and HTML Help pages. See the documentation for
89 | # a list of builtin themes.
90 | #
91 | html_theme = "sphinx_rtd_theme"
92 | 
93 | # Theme options are theme-specific and customize the look and feel of a theme
94 | # further. For a list of options available for each theme, see the
95 | # documentation.
96 | #
97 | html_theme_options = {"navigation_depth": 3}
98 | 
99 | # Add any paths that contain custom static files (such as style sheets) here,
100 | # relative to this directory. They are copied after the builtin static files,
101 | # so a file named "default.css" will overwrite the builtin "default.css".
102 | html_static_path = ["_static"]
103 | html_context = {
104 |     "css_files": ["_static/theme_overrides.css"]  # override wide tables in RTD theme
105 | }
106 | 
107 | # Custom sidebar templates, must be a dictionary that maps document names
108 | # to template names.
109 | #
110 | # The default sidebars (for documents that don't match any pattern) are
111 | # defined by theme itself. Builtin themes are using these templates by
112 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
113 | # 'searchbox.html']``.
114 | #
115 | # html_sidebars = {}
116 | 
117 | 
118 | # -- Options for HTMLHelp output ---------------------------------------------
119 | 
120 | # Output file base name for HTML help builder.
121 | htmlhelp_basename = "pewtilsdoc"
122 | 
123 | 
124 | # -- Options for LaTeX output ------------------------------------------------
125 | 
126 | latex_elements = {
127 |     # The paper size ('letterpaper' or 'a4paper').
128 |     #
129 |     # 'papersize': 'letterpaper',
130 |     # The font size ('10pt', '11pt' or '12pt').
131 |     #
132 |     # 'pointsize': '10pt',
133 |     # Additional stuff for the LaTeX preamble.
134 |     #
135 |     # 'preamble': '',
136 |     # Latex figure (float) alignment
137 |     #
138 |     # 'figure_align': 'htbp',
139 | }
140 | 
141 | # Grouping the document tree into LaTeX files. List of tuples
142 | # (source start file, target name, title,
143 | #  author, documentclass [howto, manual, or own class]).
144 | latex_documents = [
145 |     (
146 |         master_doc,
147 |         "pewtils.tex",
148 |         "pewtils Documentation",
149 |         "Pew Research Center",
150 |         "manual",
151 |     )
152 | ]
153 | 
154 | 
155 | # -- Options for manual page output ------------------------------------------
156 | 
157 | # One entry per manual page. List of tuples
158 | # (source start file, name, description, authors, manual section).
159 | man_pages = [(master_doc, "pewtils", "pewtils Documentation", [author], 1)]
160 | 
161 | 
162 | # -- Options for Texinfo output ----------------------------------------------
163 | 
164 | # Grouping the document tree into Texinfo files. List of tuples
165 | # (source start file, target name, title, author,
166 | #  dir menu entry, description, category)
167 | texinfo_documents = [
168 |     (
169 |         master_doc,
170 |         "pewtils",
171 |         "pewtils Documentation",
172 |         author,
173 |         "pewtils",
174 |         "General programming utilities from Pew Research Center.",
175 |         "Miscellaneous",
176 |     )
177 | ]
178 | 
179 | 
180 | # -- Options for Epub output -------------------------------------------------
181 | 
182 | # Bibliographic Dublin Core info.
183 | epub_title = project
184 | 
185 | # The unique identifier of the text. This can be a ISBN number
186 | # or the project homepage.
187 | #
188 | # epub_identifier = ''
189 | 
190 | # A unique identification for the text.
191 | #
192 | # epub_uid = ''
193 | 
194 | # A list of files that should not be packed into the epub file.
195 | epub_exclude_files = ["search.html"]
196 | 
197 | 
198 | # -- Extension configuration -------------------------------------------------
199 | 
200 | 
201 | def setup(app):
202 |     app.add_css_file("theme_overrides.css")
203 | 
--------------------------------------------------------------------------------
/pewtils/vanity_link_shorteners.csv:
--------------------------------------------------------------------------------
  1 | shortener,expanded,historical
  2 | 12ne.ws,12news.com,0
  3 | 2wsb.tv,wsbtv.com,0
  4 | ab.co,abc.net.au,0
  5 | abcn.ws,abcnews.com,1
  6 | actb.lu,actblue.com,1
  7 | aje.io,aljazeera.com,0
  8 | ampr.gs,americanprogress.org,0
  9 | amzn.com,amazon.com,0
 10 | amzn.to,amazon.com,0
 11 | apne.ws,apnews.com,0
 12 | armytim.es,armytimes.com,0
 13 | atxne.ws,statesman.com,0
 14 | ayre.to,calvinayre.com,1
 15 | azc.cc,azcentral.com,0
 16 | bayareane.ws,eastbaytimes.com,0
 17 | bbc.in,bbc.co.uk,0
 18 | bcove.me,bcove.me,0
 19 | bernie.to,berniesanders.com,0
 20 | bizj.us,bizjournals.com,0
 21 | ble.ac,bleacherreport.com,0
 22 | bloom.bg,bloomberg.com,0
 23 | bloombg.org,bloomberg.org,0
 24 | bos.gl,w.bos.gl,0
 25 | brook.gs,brookings.edu,0
 26 | bsun.md,baltimoresun.com,0
 27 | buswk.co,businessweek.com,1
 28 | bv.ms,bloomberg.com,0
 29 | bzfd.it,buzzfeed.com,0
 30 | c-spanvideo.org,c-span.org,0
 31 | cbsloc.al,cbslocal.com,0
 32 | cbsn.ws,cbsnews.com,0
 33 | chn.ge,change.org,0
 34 | chng.it,change.org,0
 35 | cjky.it,courier-journal.com,1
 36 | cmplx.co,complex.com,0
 37 | cnb.cx,cnbc.com,0
 38 | cnet.co,cnet.com,0
 39 | cnn.it,cnn.com,0
 40 | cnnmon.ie,cnn.com,0
 41 | comsen.se,commonsensemedia.org,0
 42 | conta.cc,constantcontact.com,0
 43 | cour.at,courant.com,1
 44 | cs.pn,c-span.org,1
 45 | csmo.us,cosmopolitan.com,0
 46 | ctrylv.co,countryliving.com,0
 47 | d-news.co,dallasnews.com,1
 48 | dai.ly,dailymotion.com,0
 49 | dailym.ai,dailymail.co.uk,0
 50 | dailysign.al,dailysignal.com,0
 51 | dbtg.tv,bundestag.de,1
 52 | de.gov,delaware.gov,0
 53 | delonline.us,delawareonline.com,0
 54 | detne.ws,detroitnews.com,0
 55 | dlsh.it,delish.com,0
 56 | dmreg.co,desmoinesregister.com,0
 57 | dot.gov,transportation.gov,1
 58 | dpo.st,denverpost.com,0
 59 | econ.st,economist.com,0
 60 | ellemag.co,elle.com,0
 61 | engt.co,engadget.com,0
 62 | entm.ag,entrepreneur.com,0
 63 | es.pn,espn.com,0
 64 | esqr.co,esquire.com,0
 65 | ewar.ren,elizabethwarren.com,1
 66 | f-st.co,fastcompany.com,0
 67 | fanda.co,fandango.com,1
 68 | fb.com,facebook.com,0
 69 | fb.me,facebook.com,0
 70 | fdrl.st,fdrl.st,0
 71 | flic.kr,flic.kr,0
 72 | fmeq.co,fmeq.co,0
 73 | fpa.ac,foodpolicyaction.org,0
 74 | fxn.ws,foxnews.com,0
 75 | g.co,google.com,1
 76 | gizmo.do,gizmodo.com,0
 77 | glblctzn.me,globalcitizen.org,0
 78 | glo.bo,globo.com,0
 79 | gma.abc,goodmorningamerica.com,0
 80 | goldenisles.news,thebrunswicknews.com,0
 81 | gph.is,giphy.com,0
 82 | grnol.co,greenvilleonline.com,0
 83 | harlem.in,harlemunited.org,1
 84 | hbaz.co,harpersbazaar.com,0
 85 | herit.ag,heritage.org,0
 86 | hill.cm,thehill.com,0
 87 | histv.co,history.com,0
 88 | hrc.io,hillaryclinton.com,1
 89 | hrld.us,miamiherald.com,1
 90 | hsbu.us,housebeautiful.com,0
 91 | hucka.be,mikehuckabee.com,1
 92 | huff.lv,huffingtonpost.com,1
 93 | huff.to,huffingtonpost.com,0
 94 | huffp.st,huffingtonpost.com,0
 95 | huffpost.com,huffingtonpost.com,0
 96 | hulu.tv,hulu.com,0
 97 | icont.ac,icont.ac,0
 98 | ift.tt,ifttt.com,0
 99 | il.gov,illinois.gov,0
100 | ind.pn,independent.co.uk,0
101 | indy.st,indystar.com,0
102 | injo.com,ijr.com,1
103 | instagr.am,instagram.com,0
104 | interc.pt,theintercept.com,0
105 | itun.es,itunes.com,0
106 | jrnl.ie,thejournal.ie,0
107 | jwatch.us,judicialwatch.org,1
108 | kpbs.us,kpbs.org,0
109 | kstp.mn,kstp.com,1
110 | ky.gov,kentucky.gov,0
111 | l-bc.co,lbc.co.uk,0
112 | lat.ms,latimes.com,0
113 | linkd.in,linkedin.com,0
114 | lnkd.in,linkedin.com,0
115 | lp.ca,lapresse.ca,0
116 | m.me,messenger.com,0
117 | ma.us,state.ma.us,1
118 | mailchi.mp,mailchimp.com,0
119 | mapq.st,mapquest.com,0
120 | marinetim.es,marinecorpstimes.com,0
121 | md.us,state.md.us,1
122 | meetu.ps,meetup.com,0
123 | mn.us,state.mn.us,1
124 | mol.im,dailymail.co.uk,0
125 | mrie.cl,marieclaire.com,0
126 | mt.gov,montana.gov,0
127 | mycj.co,mycentraljersey.com,0
128 | n.pr,npr.org,0
129 | natl.io,nationalreview.com,1
130 | natl.re,nationalreview.com,1
131 | navtim.es,navytimes.com,0
132 | nbc4i.co,nbc4i.com,0
133 | nbcbay.com,nbcbayarea.com,0
134 | nbcchi.com,nbcchicago.com,0
135 | nbcct.co,nbcconnecticut.com,0
136 | nbcnews.to,nbcnews.com,0
137 | nc1.tv,newscenter1.tv,0
138 | ne.gov,nebraska.gov,0
139 | newspr.es,news-press.com,0
140 | nj-ne.ws,nj.com,0
141 | njersy.co,northjersey.com,0
142 | nm.us,state.nm.us,1
143 | nwk.ee,europe.newsweek.com,1
144 | nws.mx,newsmax.com,1
145 | nwsdy.li,newsday.com,0
146 | ny.us,state.ny.us,1
147 | nydn.us,nydailynews.com,0
148 | nyer.cm,newyorker.com,1
149 | nyp.st,nypost.com,0
150 | nyti.ms,nytimes.com,0
151 | ofa.bo,ofa.us,1
152 | oh.us,state.oh.us,1
153 | ohne.ws,newarkadvocate.com,0
154 | on.fb.me,facebook.com,0
155 | onforb.es,forbes.com,0
156 | p4a.us,peteforamerica.com,1
157 | pa.us,state.pa.us,1
158 | pbpo.st,palmbeachpost.com,1
159 | pdora.co,pandora.com,0
160 | peoplem.ag,people.com,0
161 | pew.org,pewtrusts.org,0
162 | pewrsr.ch,pewresearch.org,0
163 | politi.co,politico.com,0
164 | prn.to,prnewswire.com,0
165 | propub.li,propublica.org,0
166 | ptrtvoic.es,patriotvoices.com,1
167 | r29.co,refinery29.com,0
168 | read.bi,businessinsider.com,0
169 | redd.it,reddit.com,0
170 | reut.rs,reuters.com,0
171 | rlm.ag,magnetmail.net,0
172 | rol.st,rollingstone.com,0
173 | roll.cl,cqrollcall.com,1
174 | rub.io,marcorubio.com,1
175 | sacb.ee,sacbee.com,1
176 | sc.mp,scmp.com,0
177 | scne.ws,thestate.com,1
178 | sen.gov,senate.gov,0
179 | sfex.news,sfexaminer.com,0
180 | slate.me,slate.com,0
181 | spkrryan.us,speaker.gov,1
182 | spon.de,spiegel.de,0
183 | spoti.fi,spotify.com,0
184 | st.news,seattletimes.com,1
185 | stjr.nl,statesmanjournal.com,0
186 | strib.mn,startribune.com,0
187 | tannos.mx,yarithtannos.com,0
188 | tgam.ca,theglobeandmail.com,0
189 | theatln.tc,theatlantic.com,0
190 | thebea.st,thedailybeast.com,0
191 | thegaz.co,thegazette.com,0
192 | thkpr.gs,thinkprogress.org,1
193 | thr.cm,hollywoodreporter.com,0
194 | ti.me,time.com,0
195 | tl.gd,twitlonger.com,0
196 | tlmdo.co,telemundo.com,0
197 | tmz.me,tmz.com,0
198 | tnne.ws,tennessean.com,1
199 | tnw.to,thenextweb.com,0
200 | tonyr.co,tonyrobbins.com,1
201 | trib.in,chicagotribune.com,1
202 | tun.in,tunein.com,0
203 | tusconne.ws,kold.com,1
204 | twimg.com,twitter.com,0
205 | twnctry.co,townandcountrymag.com,0
206 | tws.io,weeklystandard.com,1
207 | txnne.ws,thetexan.news,0
208 | txpo.li,texaspolicy.com,1
209 | u.pw,upworthy.com,0
210 | uni.vi,univision.com,1
211 | usat.ly,usatoday.com,0
212 | usg.lc,usglc.org,1
213 | usm.ag,usmagazine.com,0
214 | virg.in,virgin.com,0
215 | vntyfr.com,vanityfair.com,0
216 | vogue.cm,vogue.com,0
217 | vpr.net,vpr.org,0
218 | wapo.st,washingtonpost.com,1
219 | washex.am,washingtonexaminer.com,0
220 | wb.md,webmd.com,0
221 | wbur.fm,wbur.org,0
222 | wdtn.tv,wdtn.com,0
223 | wef.ch,weforum.org,0
224 | wh.gov,whitehouse.gov,0
225 | wink.news,winknews.com,0
226 | wpo.st,washingtonpost.com,1
227 | wrd.cm,wired.com,0
228 | wtim.es,washingtontimes.com,0
229 | wtr.ie,water.ie,0
230 | wtrne.ws,timesrecordnews.com,0
231 | wwrld.us,wenatcheeworld.com,0
232 | wxch.nl,weather.com,0
233 | yhoo.it,yahoo.com,0
234 | youtu.be,youtube.com,0
--------------------------------------------------------------------------------
/docs_source/examples.rst:
--------------------------------------------------------------------------------
 1 | **************
 2 | Examples
 3 | **************
 4 | 
 5 | Check for null values
 6 | -----------------------------------------------------
 7 | 
 8 | You can use the :py:func:`pewtils.is_null` and :py:func:`pewtils.is_not_null` functions to quickly check for a \
 9 | variety of common null values.
10 | 
11 | .. code-block:: python
12 | 
13 |     from pewtils import is_null
14 |     from pewtils import is_not_null
15 |     import numpy as np
16 | 
17 |     >>> is_null(None)
18 |     True
19 |     >>> is_null("None")
20 |     True
21 |     >>> is_null("nan")
22 |     True
23 |     >>> is_null("")
24 |     True
25 |     >>> is_null(" ")
26 |     True
27 |     >>> is_null("NaN")
28 |     True
29 |     >>> is_null("none")
30 |     True
31 |     >>> is_null("NONE")
32 |     True
33 |     >>> is_null("n/a")
34 |     True
35 |     >>> is_null("N/A")
36 |     True
37 |     >>> is_null(np.nan)
38 |     True
39 |     >>> is_null("-9", custom_nulls=["-9"])
40 |     True
41 |     >>> is_null("Hello World")
42 |     False
43 |     >>> is_null(0.0)
44 |     False
45 | 
46 | Collapse documents into context-sensitive hashes
47 | -----------------------------------------------------
48 | 
49 | When working with large documents, you can use the :py:func:`pewtils.get_hash` function to convert \
50 | them into a variety of different hashed representations. By default, this function uses SSDEEP, which \
51 | produces context-sensitive hashes that can be useful for searching for similar documents.
52 | 
53 | .. code-block:: python
54 | 
55 |     from pewtils import get_hash
56 | 
57 |     >>> doc1 = "This is a document."
58 | >>> doc2 = "This is a document. But this one is longer." 59 | >>> get_hash(doc1) 60 | '3:hMCE+RL:hu+t' 61 | >>> get_hash(doc2) 62 | '3:hMCE+RGreCQHCAb:hu+0rLkb' 63 | # Notice that both hashes start the same way, corresponding to their overlapping text. 64 | 65 | Flatten nested lists 66 | ----------------------------------------------------- 67 | 68 | Easily flatten lists of lists: 69 | 70 | .. code-block:: python 71 | 72 | from pewtils import flatten_list 73 | 74 | >>> nested_lists = [[1, 2, 3], [4, 5, 6]] 75 | >>> flatten_list(nested_lists) 76 | [1, 2, 3, 4, 5, 6] 77 | 78 | Recursively update dictionaries and object attributes 79 | ----------------------------------------------------- 80 | 81 | Map a dictionary or object onto another version of itself to update overlapping attributes: 82 | 83 | .. code-block:: python 84 | 85 | from pewtils import recursive_update 86 | 87 | class TestObject(object): 88 | def __init__(self, value): 89 | self.value = value 90 | self.dict = {"obj_key": "original"} 91 | def __repr__(self): 92 | return("TestObject(value='{}', dict={})".format(self.value, self.dict)) 93 | 94 | original = { 95 | "object": TestObject("original"), 96 | "key1": {"key2": "original"} 97 | } 98 | update = { 99 | "object": {"value": "updated", "dict": {"obj_key": "updated"}}, 100 | "key1": {"key3": "new"} 101 | } 102 | 103 | >>> recursive_update(original, update) 104 | {'object': TestObject(value='updated', dict={'obj_key': 'updated'}), 105 | 'key1': {'key2': 'original', 'key3': 'new'}} 106 | 107 | 108 | Efficiently map a function onto a Pandas Series 109 | ----------------------------------------------------- 110 | 111 | Avoid repeating database lookups or expensive computations when applying a function to a Pandas \ 112 | Series by using the :py:func:`pewtils.cached_series_mapper` function, which caches the results \ 113 | for each value in the series as it iterates. 114 | 115 | .. code-block:: python 116 | 117 | import pandas as pd 118 | from pewtils import cached_series_mapper 119 | 120 | values = ["value"]*10 121 | def my_function(x): 122 | print(x) 123 | return x 124 | 125 | df = pd.DataFrame(values, columns=['column']) 126 | >>> mapped = df['column'].map(my_function) 127 | value 128 | value 129 | value 130 | value 131 | value 132 | value 133 | value 134 | value 135 | value 136 | value 137 | >>> mapped = cached_series_mapper(df['column'], my_function) 138 | value 139 | 140 | Read and write data in a variety of formats 141 | ----------------------------------------------------- 142 | 143 | The :py:class:`pewtils.io.FileHandler` class lets you easily read and write files in a variety of \ 144 | formats with minimal code, and it has support for Amazon S3 too: 145 | 146 | .. 
code-block:: python 147 | 148 | from pewtils.io import FileHandler 149 | 150 | >>> h = FileHandler("./", use_s3=False) # current local folder 151 | >>> df = h.read("my_csv", format="csv") 152 | # Do something and save to Excel 153 | >>> h.write("my_new_csv", df, format="xlsx") 154 | 155 | >>> my_data = [{"key": "value"}] 156 | >>> h.write("my_data", my_data, format="json") 157 | 158 | >>> my_data = ["a", "python", "list"] 159 | >>> h.write("my_data", my_data, format="pkl") 160 | 161 | # To read/write to an S3 bucket, simply pass your bucket name 162 | >>> h = FileHandler("/my_folder", use_s3=True, bucket="my-bucket") 163 | # The FileHandler can also detect your tokens directly from your environment 164 | # Just set the environment variables AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and S3_BUCKET 165 | 166 | Quickly extract text from raw HTML 167 | ----------------------------------------------------- 168 | 169 | It's not always perfect, but the :py:func:`pewtils.http.strip_html` function can often be used to \ 170 | extract most of the valuable text data from raw HTML documents - useful for quick exploratory \ 171 | analysis after scraping a bunch of webpages. 172 | 173 | .. code-block:: python 174 | 175 | from pewtils.http import strip_html 176 | 177 | >>> my_html = "<h1>Header text</h1><p>Body text</p>" 178 | >>> strip_html(my_html) 179 | 'Header text\n\nBody text' 180 | 181 | Standardize URLs and extract domains 182 | ----------------------------------------------------- 183 | 184 | The :py:func:`pewtils.http.canonical_link` function is our best attempt at resolving URLs to their \ 185 | true form: it follows shortened URLs, removes unnecessary GET parameters, and tries to avoid returning \ 186 | incorrect 404 pages in favor of the most informative last-known version of a URL. Once links have been \ 187 | standardized, you can also use the :py:func:`pewtils.http.extract_domain_from_url` function to pull \ 188 | out domains and subdomains. 189 | 190 | .. 
code-block:: python 191 | 192 | from pewtils.http import canonical_link 193 | 194 | >>> canonical_link("https://pewrsr.ch/2lxB0EX?unnecessary_param=1") 195 | "https://www.pewresearch.org/interactives/how-does-a-computer-see-gender/" 196 | 197 | from pewtils.http import extract_domain_from_url 198 | 199 | >>> extract_domain_from_url("http://forums.bbc.co.uk", include_subdomain=False) 200 | "bbc.co.uk" 201 | >>> extract_domain_from_url("http://forums.bbc.co.uk", include_subdomain=True) 202 | "forums.bbc.co.uk" 203 | -------------------------------------------------------------------------------- /tests/http.py: -------------------------------------------------------------------------------- 1 | import unittest, re 2 | 3 | 4 | class HTTPTests(unittest.TestCase): 5 | """ 6 | To test, navigate to pewtils root folder and run `python -m unittest tests` 7 | """ 8 | 9 | def setUp(self): 10 | pass 11 | 12 | def test_hash_url(self): 13 | from pewtils.http import hash_url 14 | 15 | url = hash_url("http://www.example.com") 16 | self.assertEqual(url, "7c1767b30512b6003fd3c2e618a86522") 17 | url = hash_url("www.example.com") 18 | self.assertEqual(url, "7c1767b30512b6003fd3c2e618a86522") 19 | 20 | def test_strip_html(self): 21 | # example.html taken from example.com on 3/5/19 22 | from contextlib import closing 23 | 24 | with closing(open("tests/files/example.html", "r")) as input: 25 | html = input.read() 26 | from pewtils.http import strip_html 27 | 28 | stripped_html = strip_html(html, simple=False) 29 | stripped_simple_html = strip_html(html, simple=True) 30 | # with closing(open("tests/files/example_stripped.html", "w")) as output: 31 | # output.write(stripped_html) 32 | # with closing(open("tests/files/example_stripped_simple.html", "w")) as output: 33 | # output.write(stripped_simple_html) 34 | 35 | with closing(open("tests/files/example_stripped.html", "r")) as input: 36 | text = input.read() 37 | self.assertEqual(text, stripped_html) 38 | with closing(open("tests/files/example_stripped_simple.html", "r")) as input: 39 | text = input.read() 40 | self.assertEqual(text, stripped_simple_html) 41 | 42 | def test_canonical_link(self): 43 | 44 | from pewtils.http import canonical_link 45 | 46 | user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 11.3; rv:88.0) Gecko/20100101 Firefox/88.0" 47 | 48 | for original_url, canonical_url in [ 49 | ( 50 | "https://nbcnews.to/2Yc5JVz", 51 | "https://www.nbcnews.com/politics/congress/senate-vote-9-11-first-responders-bill-tuesday-n1032831?cid=sm_npd_nn_tw_ma", 52 | ), 53 | ( 54 | "https://www.google.com/maps/d/viewer?mid=zQ8Zk-5ey-Y8.kgD9Rxu8JCNQ&hl=en&usp=sharing", 55 | "https://www.google.com/maps/d/viewer?mid=1NQVHeBBcVAnz9JwX1frZxX1ZgjY", 56 | ), 57 | ( 58 | "https://pewrsr.ch/2kk3VvY", 59 | "https://www.pewresearch.org/internet/2019/09/05/more-than-half-of-u-s-adults-trust-law-enforcement-to-use-facial-recognition-responsibly/", 60 | ), 61 | ( 62 | "https://pewrsr.ch/2ly4LFE", 63 | "https://www.pewresearch.org/internet/2019/09/05/the-challenges-of-using-machine-learning-to-identify-gender-in-images/", 64 | ), 65 | ( 66 | "https://pewrsr.ch/2lxB0EX", 67 | "https://www.pewresearch.org/interactives/how-does-a-computer-see-gender/", 68 | ), 69 | ]: 70 | result = canonical_link(original_url, user_agent=user_agent, timeout=60) 71 | self.assertEqual(result, canonical_url) 72 | 73 | def test_trim_get_parameters(self): 74 | from pewtils.http import trim_get_parameters 75 | 76 | user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 11.3; rv:88.0) Gecko/20100101 
Firefox/88.0" 77 | for original_url, trimmed_url in [ 78 | ("https://httpbin.org/status/200", "https://httpbin.org/status/200"), 79 | ( 80 | "https://httpbin.org/status/200?param=1", 81 | "https://httpbin.org/status/200", 82 | ), 83 | ]: 84 | trimmed = trim_get_parameters( 85 | original_url, user_agent=user_agent, timeout=30 86 | ) 87 | self.assertEqual(trimmed, trimmed_url) 88 | 89 | def test_link_shortener_map(self): 90 | 91 | import requests 92 | from six.moves.urllib import parse as urlparse 93 | from pewtils.http import ( 94 | GENERAL_LINK_SHORTENERS, 95 | VANITY_LINK_SHORTENERS, 96 | HISTORICAL_VANITY_LINK_SHORTENERS, 97 | trim_get_parameters, 98 | ) 99 | 100 | # These are domains that resolve properly but are alternatives to a preferred version 101 | IGNORE_DOMAINS = [ 102 | "ap.org", 103 | "cnet.co", 104 | "de.gov", 105 | "huffpost.com", 106 | "ky.gov", 107 | "mt.gov", 108 | "sen.gov", 109 | "twimg.com", 110 | ] 111 | 112 | user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 11.3; rv:88.0) Gecko/20100101 Firefox/88.0" 113 | self.session = requests.Session() 114 | self.session.headers.update({"User-Agent": user_agent}) 115 | for k, v in VANITY_LINK_SHORTENERS.items(): 116 | if ( 117 | k not in HISTORICAL_VANITY_LINK_SHORTENERS.keys() 118 | and k not in IGNORE_DOMAINS 119 | ): 120 | try: 121 | resp = self.session.head("http://{}".format(k), allow_redirects=True, timeout=10) 122 | 123 | except requests.exceptions.ConnectionError: 124 | print(f"Could not resolve short domain (may be historic): {k} (connection error)") 125 | resp = None 126 | 127 | if resp: 128 | resp_url = trim_get_parameters(resp.url, session=self.session, timeout=10).split("?")[0] 129 | 130 | if k in resp_url: 131 | print(f"Short domain resolved unexpectedly (may be historic): {k} (resolved to {resp_url} but expected {v})") 132 | 133 | else: 134 | resolved = re.match( 135 | "(www[0-9]?\.)?([^:]+)(:\d+$)?", 136 | urlparse.urlparse(resp.url).netloc, 137 | ).group(2).rstrip('/') 138 | resolved = VANITY_LINK_SHORTENERS.get(resolved, resolved) 139 | # Vanity domains are often purchased/managed through bit.ly or trib.al, and don't resolve 140 | # to their actual website unless paired with an actual page URL; so as long as they resolve 141 | # to what we expect, or a generic vanity URL like bit.ly, we'll assume everything's good 142 | self.assertTrue(resolved in GENERAL_LINK_SHORTENERS or v in resolved) 143 | 144 | self.session.close() 145 | 146 | def test_extract_domain_from_url(self): 147 | from pewtils.http import extract_domain_from_url 148 | 149 | for url, domain, include_subdomain, resolve in [ 150 | ("https://pewrsr.ch/2lxB0EX", "pewresearch.org", False, False), 151 | ("https://pewrsr.ch/2lxB0EX", "pewresearch.org", False, True), 152 | ("https://nbcnews.to/2Yc5JVz", "nbcnews.com", False, False), 153 | ("https://nbcnews.to/2Yc5JVz", "nbcnews.com", False, True), 154 | ("https://news.ycombinator.com", "ycombinator.com", False, False), 155 | ("https://news.ycombinator.com", "news.ycombinator.com", True, False), 156 | ("http://forums.bbc.co.uk", "forums.bbc.co.uk", True, False), 157 | ("http://forums.bbc.co.uk", "bbc.co.uk", False, False), 158 | ("http://www.worldbank.org.kg/", "worldbank.org.kg", True, False), 159 | ("http://forums.news.cnn.com/", "forums.news.cnn.com", True, False), 160 | ("http://forums.news.cnn.com/", "cnn.com", False, False), 161 | ]: 162 | extracted_domain = extract_domain_from_url( 163 | url, include_subdomain=include_subdomain, resolve_url=resolve 164 | ) 165 | self.assertEqual(extracted_domain, 
domain) 166 | 167 | def tearDown(self): 168 | if getattr(self, 'session', None) is not None: 169 | self.session.close() 170 | -------------------------------------------------------------------------------- /tests/io.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | from contextlib import closing 4 | 5 | 6 | class IOTests(unittest.TestCase): 7 | """ 8 | To test, navigate to pewtils root folder and run `python -m unittest tests` 9 | """ 10 | 11 | def setUp(self): 12 | import pandas as pd 13 | 14 | self.test_df = pd.DataFrame( 15 | [{"test": 1}, {"test": 2}, {"test": 3}, {"test": 4}] 16 | ) 17 | self.test_json = {"test1": 1, "test2": 2, "test3": 3, "test4": 4} 18 | import json 19 | 20 | test_json = json.dumps(self.test_json) 21 | self.test_json = json.loads(test_json) 22 | 23 | def test_filehandler_iterate_path(self): 24 | from pewtils.io import FileHandler 25 | 26 | h = FileHandler("tests/files", use_s3=False) 27 | files = [] 28 | for file in h.iterate_path(): 29 | files.append(file) 30 | files = [ 31 | f 32 | for f in files 33 | if not f.endswith(".pyc") and f not in ["__pycache__", ".DS_Store"] 34 | ] 35 | self.assertEqual( 36 | sorted(files), 37 | sorted( 38 | [ 39 | "subfolder", 40 | "__init__.py", 41 | "example.html", 42 | "example_stripped_simple.html", 43 | "json.json", 44 | "example_stripped.html", 45 | "py.py", 46 | ] 47 | ), 48 | ) 49 | 50 | def test_filehandler_clear_folder(self): 51 | from pewtils.io import FileHandler 52 | 53 | h = FileHandler("tests/files/temp", use_s3=False) 54 | 55 | with closing(open("tests/files/temp/temp.txt", "wb")) as output: 56 | output.write(b"test") 57 | h.clear_folder() 58 | files = [] 59 | for file in h.iterate_path(): 60 | files.append(file) 61 | self.assertEqual(len(files), 0) 62 | os.rmdir("tests/files/temp") 63 | 64 | def test_clear_file(self): 65 | from pewtils.io import FileHandler 66 | 67 | h = FileHandler("tests/files/temp", use_s3=False) 68 | with closing(open("tests/files/temp/temp.txt", "wb")) as output: 69 | output.write(b"test") 70 | h.clear_file("temp", format="txt") 71 | files = [] 72 | for file in h.iterate_path(): 73 | files.append(file) 74 | self.assertNotIn("temp.txt", files) 75 | self.assertEqual(len(files), 0) 76 | os.rmdir("tests/files/temp") 77 | 78 | h = FileHandler("tests/files/temp", use_s3=False) 79 | key = h.get_key_hash("temp") 80 | with closing(open("tests/files/temp/{}.txt".format(key), "wb")) as output: 81 | output.write(b"test") 82 | h.clear_file("temp", format="txt", hash_key=True) 83 | files = [] 84 | for file in h.iterate_path(): 85 | files.append(file) 86 | self.assertNotIn("{}.txt".format(key), files) 87 | self.assertEqual(len(files), 0) 88 | os.rmdir("tests/files/temp") 89 | 90 | def test_filehandler_get_key_hash(self): 91 | from pewtils.io import FileHandler 92 | 93 | h = FileHandler("tests/files", use_s3=False) 94 | self.assertEqual( 95 | h.get_key_hash("temp"), 96 | "c51bf90ccb22befa316b7a561fe9d5fd9650180b14421fc6d71bcd57", 97 | ) 98 | self.assertEqual( 99 | h.get_key_hash({"key": "value"}), 100 | "37e13e1116c86a6e9f3f8926375c7cb977ca74d2d598572ced03cd09", 101 | ) 102 | 103 | def test_filehandler_get_key_hash_s3(self): 104 | from pewtils.io import FileHandler 105 | 106 | if os.environ.get("S3_BUCKET"): 107 | h = FileHandler("tests/files", use_s3=True) 108 | self.assertEqual( 109 | h.get_key_hash("temp"), 110 | "c51bf90ccb22befa316b7a561fe9d5fd9650180b14421fc6d71bcd57", 111 | ) 112 | self.assertEqual( 113 | h.get_key_hash({"key": 
"value"}), 114 | "37e13e1116c86a6e9f3f8926375c7cb977ca74d2d598572ced03cd09", 115 | ) 116 | 117 | def test_filehandler_read_write_pkl(self): 118 | from pewtils.io import FileHandler 119 | 120 | h = FileHandler("tests/files", use_s3=False) 121 | h.write("temp", self.test_df, format="pkl") 122 | read = h.read("temp", format="pkl") 123 | import os 124 | 125 | os.unlink("tests/files/temp.pkl") 126 | self.assertEqual(repr(self.test_df), repr(read)) 127 | 128 | def test_filehandler_read_write_pkl_s3(self): 129 | from pewtils.io import FileHandler 130 | 131 | if os.environ.get("S3_BUCKET"): 132 | h = FileHandler("tests/files", use_s3=True) 133 | h.write("temp", self.test_df, format="pkl") 134 | read = h.read("temp", format="pkl") 135 | self.assertEqual(repr(self.test_df), repr(read)) 136 | 137 | def test_filehandler_read_write_csv(self): 138 | from pewtils.io import FileHandler 139 | 140 | h = FileHandler("tests/files", use_s3=False) 141 | h.write("temp", self.test_df, format="csv") 142 | read = h.read("temp", format="csv") 143 | del read["Unnamed: 0"] 144 | import os 145 | 146 | os.unlink("tests/files/temp.csv") 147 | self.assertEqual(repr(self.test_df), repr(read)) 148 | 149 | def test_filehandler_read_write_csv_s3(self): 150 | from pewtils.io import FileHandler 151 | 152 | if os.environ.get("S3_BUCKET"): 153 | h = FileHandler("tests/files", use_s3=True) 154 | h.write("temp", self.test_df, format="csv") 155 | read = h.read("temp", format="csv") 156 | del read["Unnamed: 0"] 157 | self.assertEqual(repr(self.test_df), repr(read)) 158 | 159 | def test_filehandler_read_write_txt(self): 160 | from pewtils.io import FileHandler 161 | 162 | h = FileHandler("tests/files", use_s3=False) 163 | h.write("temp", "test", format="txt") 164 | read = h.read("temp", format="txt") 165 | import os 166 | 167 | os.unlink("tests/files/temp.txt") 168 | self.assertEqual(read, "test") 169 | 170 | def test_filehandler_read_write_txt_s3(self): 171 | from pewtils.io import FileHandler 172 | 173 | if os.environ.get("S3_BUCKET"): 174 | h = FileHandler("tests/files", use_s3=True) 175 | h.write("temp", "test", format="txt") 176 | read = h.read("temp", format="txt") 177 | self.assertEqual(read, "test") 178 | 179 | def test_filehandler_read_write_tab(self): 180 | from pewtils.io import FileHandler 181 | 182 | h = FileHandler("tests/files", use_s3=False) 183 | h.write("temp", self.test_df, format="tab") 184 | read = h.read("temp", format="tab") 185 | del read["Unnamed: 0"] 186 | import os 187 | 188 | os.unlink("tests/files/temp.tab") 189 | self.assertEqual(repr(self.test_df), repr(read)) 190 | 191 | def test_filehandler_read_write_tab_s3(self): 192 | from pewtils.io import FileHandler 193 | 194 | if os.environ.get("S3_BUCKET"): 195 | h = FileHandler("tests/files", use_s3=True) 196 | h.write("temp", self.test_df, format="tab") 197 | read = h.read("temp", format="tab") 198 | del read["Unnamed: 0"] 199 | self.assertEqual(repr(self.test_df), repr(read)) 200 | 201 | def test_filehandler_read_write_xlsx(self): 202 | from pewtils.io import FileHandler 203 | 204 | h = FileHandler("tests/files", use_s3=False) 205 | h.write("temp", self.test_df, format="xlsx") 206 | read = h.read("temp", format="xlsx") 207 | if "Unnamed: 0" in read.columns: 208 | del read["Unnamed: 0"] 209 | import os 210 | 211 | os.unlink("tests/files/temp.xlsx") 212 | self.assertEqual(repr(self.test_df), repr(read)) 213 | 214 | def test_filehandler_read_write_xlsx_s3(self): 215 | from pewtils.io import FileHandler 216 | 217 | if os.environ.get("S3_BUCKET"): 218 | h = 
FileHandler("tests/files", use_s3=True) 219 | h.write("temp", self.test_df, format="xlsx") 220 | read = h.read("temp", format="xlsx") 221 | if "Unnamed: 0" in read.columns: 222 | del read["Unnamed: 0"] 223 | self.assertEqual(repr(self.test_df), repr(read)) 224 | 225 | def test_filehandler_read_write_xls(self): 226 | from pewtils.io import FileHandler 227 | 228 | h = FileHandler("tests/files", use_s3=False) 229 | h.write("temp", self.test_df, format="xls") 230 | read = h.read("temp", format="xls") 231 | if "Unnamed: 0" in read.columns: 232 | del read["Unnamed: 0"] 233 | import os 234 | 235 | os.unlink("tests/files/temp.xls") 236 | self.assertEqual(repr(self.test_df), repr(read)) 237 | 238 | def test_filehandler_read_write_xl_s3(self): 239 | from pewtils.io import FileHandler 240 | 241 | if os.environ.get("S3_BUCKET"): 242 | h = FileHandler("tests/files", use_s3=True) 243 | h.write("temp", self.test_df, format="xls") 244 | read = h.read("temp", format="xls") 245 | if "Unnamed: 0" in read.columns: 246 | del read["Unnamed: 0"] 247 | self.assertEqual(repr(self.test_df), repr(read)) 248 | 249 | def test_filehandler_read_write_dta(self): 250 | from pewtils.io import FileHandler 251 | 252 | h = FileHandler("tests/files", use_s3=False) 253 | h.write("temp", self.test_df, format="dta") 254 | read = h.read("temp", format="dta") 255 | del read["index"] 256 | import os 257 | 258 | os.unlink("tests/files/temp.dta") 259 | self.assertEqual(repr(self.test_df), repr(read)) 260 | 261 | def test_filehandler_read_write_dta_s3(self): 262 | from pewtils.io import FileHandler 263 | 264 | if os.environ.get("S3_BUCKET"): 265 | h = FileHandler("tests/files", use_s3=True) 266 | h.write("temp", self.test_df, format="dta") 267 | read = h.read("temp", format="dta") 268 | del read["index"] 269 | self.assertEqual(repr(self.test_df), repr(read)) 270 | 271 | def test_filehandler_read_write_json(self): 272 | from pewtils.io import FileHandler 273 | 274 | h = FileHandler("tests/files", use_s3=False) 275 | h.write("temp", self.test_json, format="json") 276 | read = h.read("temp", format="json") 277 | import os 278 | 279 | os.unlink("tests/files/temp.json") 280 | self.assertEqual(repr(self.test_json), repr(dict(read))) 281 | 282 | def test_filehandler_read_write_json_s3(self): 283 | from pewtils.io import FileHandler 284 | 285 | if os.environ.get("S3_BUCKET"): 286 | h = FileHandler("tests/files", use_s3=True) 287 | h.write("temp", self.test_json, format="json") 288 | read = h.read("temp", format="json") 289 | self.assertEqual(repr(self.test_json), repr(dict(read))) 290 | 291 | def tearDown(self): 292 | 293 | import os 294 | 295 | try: 296 | os.unlink("tests/files/temp/temp.txt") 297 | except OSError: 298 | pass 299 | for format in ["pkl", "csv", "tab", "txt", "xlsx", "xls", "dta", "json"]: 300 | try: 301 | os.unlink("tests/files/temp.{}".format(format)) 302 | except OSError: 303 | pass 304 | try: 305 | os.rmdir("tests/files/temp") 306 | except OSError: 307 | pass 308 | 309 | from pewtils.io import FileHandler 310 | 311 | if os.environ.get("S3_BUCKET"): 312 | h = FileHandler("tests/files", use_s3=True) 313 | for file in h.iterate_path(): 314 | if "." 
in file: 315 | filename, format = file.split(".") 316 | h.clear_file(filename, format=format) 317 | -------------------------------------------------------------------------------- /tests/base.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | 4 | class BaseTests(unittest.TestCase): 5 | 6 | """ 7 | To test, navigate to pewtils root folder and run `python -m unittest tests`. 8 | To assess unit test coverage, run `coverage run -m unittest tests` and then `coverage report -m`. 9 | """ 10 | 11 | def setUp(self): 12 | pass 13 | 14 | def test_decode_text(self): 15 | class FakeObject(object): 16 | def __str__(self): 17 | return "str" 18 | 19 | def __repr__(self): 20 | return "repr" 21 | 22 | import datetime 23 | import numpy as np 24 | from pewtils import decode_text 25 | 26 | text = decode_text("one two three") 27 | self.assertEqual(text, "one two three") 28 | # below examples taken from unidecode documentation 29 | text = decode_text(u"ko\u017eu\u0161\u010dek") 30 | self.assertEqual(text, "kozuscek") 31 | text = decode_text(u"30 \U0001d5c4\U0001d5c6/\U0001d5c1") 32 | self.assertIn(text, ["30 km/h", "30 /"]) 33 | # Python 2.7 does not have support for UTF-16 so it will fail on the above 34 | text = decode_text(u"\u5317\u4EB0") 35 | self.assertEqual(text, "Bei Jing ") 36 | text = decode_text(datetime.date(2019, 1, 1)) 37 | self.assertEqual(text, "2019-01-01") 38 | text = decode_text(None) 39 | self.assertEqual(text, "") 40 | text = decode_text("") 41 | self.assertEqual(text, "") 42 | text = decode_text(np.nan) 43 | self.assertEqual(text, "") 44 | text = decode_text(FakeObject()) 45 | self.assertEqual(text, "str") 46 | 47 | def test_is_null(self): 48 | 49 | import numpy as np 50 | import pandas as pd 51 | from pewtils import is_null, is_not_null 52 | 53 | for val in [None, "None", "nan", "", " ", "NaN", "none", "n/a", "NONE", "N/A"]: 54 | self.assertTrue(is_null(val)) 55 | self.assertTrue(is_null(np.nan)) 56 | self.assertTrue(is_not_null(0.0)) 57 | self.assertTrue(is_null("-9", custom_nulls=["-9"])) 58 | self.assertTrue(is_null([], empty_lists_are_null=True)) 59 | self.assertFalse(is_null([], empty_lists_are_null=False)) 60 | self.assertTrue(is_null(pd.Series(dtype=np.float64), empty_lists_are_null=True)) 61 | self.assertFalse(is_null(pd.Series(dtype=np.float64), empty_lists_are_null=False)) 62 | self.assertTrue(is_null(pd.DataFrame(), empty_lists_are_null=True)) 63 | self.assertFalse(is_null(pd.DataFrame(), empty_lists_are_null=False)) 64 | 65 | def test_recursive_update(self): 66 | from pewtils import recursive_update 67 | 68 | class TestObject(object): 69 | def __init__(self, val): 70 | self.val = val 71 | self.val_dict = {"key": "value"} 72 | 73 | test_obj = TestObject("1") 74 | base = { 75 | "level1": {"level2": {"val2": "test2"}, "val1": "test1", "val2": test_obj} 76 | } 77 | update = { 78 | "level1": { 79 | "level2": {"val2": "test123456"}, 80 | "val1": "test123", 81 | "val2": {"val": "2", "val_dict": {"key": "new_value"}}, 82 | "val3": {"test": "test"}, 83 | } 84 | } 85 | result = recursive_update(base, update) 86 | self.assertEqual(result["level1"]["level2"]["val2"], "test123456") 87 | self.assertEqual(result["level1"]["val1"], "test123") 88 | self.assertEqual(result["level1"]["val2"].val, "2") 89 | self.assertEqual(result["level1"]["val2"].val_dict["key"], "new_value") 90 | self.assertEqual(result["level1"]["val3"]["test"], "test") 91 | 92 | def test_chunk_list(self): 93 | from pewtils import chunk_list 94 | 95 | test = [1, 2, 
3, 4, 5, 6, 7, 8, 9, 10] 96 | chunked = [c for c in chunk_list(test, 3)] 97 | self.assertEqual(len(chunked), 4) 98 | self.assertEqual(chunked[-1], [10]) 99 | 100 | def test_extract_json_from_folder(self): 101 | from pewtils import extract_json_from_folder 102 | 103 | results = extract_json_from_folder( 104 | "tests/files", include_subdirs=False, concat_subdir_names=False 105 | ) 106 | self.assertEqual(results, {"json": {u"test_val": 1}}) 107 | results = extract_json_from_folder( 108 | "tests/files", include_subdirs=True, concat_subdir_names=False 109 | ) 110 | self.assertEqual( 111 | results, 112 | {"json": {u"test_val": 1}, "subfolder": {"subfolder": {u"test_val": 2}}}, 113 | ) 114 | results = extract_json_from_folder( 115 | "tests/files", include_subdirs=True, concat_subdir_names=True 116 | ) 117 | self.assertEqual( 118 | results, {"json": {u"test_val": 1}, "subfolder_subfolder": {u"test_val": 2}} 119 | ) 120 | 121 | def test_extract_attributes_from_folder_modules(self): 122 | from pewtils import extract_attributes_from_folder_modules 123 | 124 | results = extract_attributes_from_folder_modules("tests/files", "test") 125 | self.assertEqual(results["py"](), "test1") 126 | results = extract_attributes_from_folder_modules( 127 | "tests/files", "test", include_subdirs=True 128 | ) 129 | self.assertEqual(results["py"](), "test1") 130 | self.assertEqual(results["subfolder"]["subfolder_py"](), "test2") 131 | results = extract_attributes_from_folder_modules( 132 | "tests/files", "test", include_subdirs=True, concat_subdir_names=True 133 | ) 134 | self.assertEqual(results["py"](), "test1") 135 | self.assertEqual(results["subfolder_subfolder_py"](), "test2") 136 | 137 | def test_zipcode_num_to_string(self): 138 | 139 | from pewtils import zipcode_num_to_string 140 | 141 | for val in [20002, 20002.0, "20002", "20002.0"]: 142 | zip = zipcode_num_to_string(val) 143 | self.assertEqual(zip, "20002") 144 | for val in ["abcde", "12", "99999", "200", "1.0", None]: 145 | zip = zipcode_num_to_string(val) 146 | self.assertIsNone(zip) 147 | 148 | def test_flatten_list(self): 149 | from pewtils import flatten_list 150 | 151 | results = flatten_list([[1, 2, 3], [4, 5, 6]]) 152 | self.assertEqual(results, [1, 2, 3, 4, 5, 6]) 153 | 154 | def test_get_hash(self): 155 | from pewtils import get_hash 156 | 157 | for text, method, expected_value in [ 158 | ( 159 | "test_string", 160 | "nilsimsa", 161 | "49c808104092202004009004800200084a0240a0c09040a1113a04a821210016", 162 | ), 163 | ("test_string", "md5", "3474851a3410906697ec77337df7aae4"), 164 | ("test_string", "ssdeep", "3:HI2:Hl"), 165 | ( 166 | u"\u5317\u4EB0", 167 | "nilsimsa", 168 | "0100000044110004290804002820001002844001200601000101002800394081", 169 | ), 170 | (u"\u5317\u4EB0", "md5", "3261ad50fccf7ced43d944bbfd2acb5c"), 171 | (u"\u5317\u4EB0", "ssdeep", "3:I2n:l"), 172 | ]: 173 | hash = get_hash(text, hash_function=method) 174 | self.assertEqual(hash, expected_value) 175 | 176 | def test_concat_text(self): 177 | from pewtils import concat_text 178 | 179 | result = concat_text( 180 | "one two three", u"ko\u017eu\u0161\u010dek", u"\u5317\u4EB0", None 181 | ) 182 | self.assertEqual(result, "one two three kozuscek Bei Jing ") 183 | 184 | def test_vector_concat_text(self): 185 | from pewtils import vector_concat_text 186 | 187 | result = vector_concat_text(["one", "two", "three"], ["a", "b", "c"]) 188 | self.assertEqual(result[0], "one a") 189 | self.assertEqual(result[1], "two b") 190 | self.assertEqual(result[2], "three c") 191 | 192 | def 
test_cached_series_mapper(self): 193 | import pandas as pd 194 | from pewtils import cached_series_mapper 195 | 196 | df = pd.DataFrame([{"test": 1}, {"test": 2}, {"test": 3}, {"test": 3}]) 197 | df["mapped"] = cached_series_mapper(df["test"], lambda x: str(float(x))) 198 | self.assertEqual(list(df["mapped"].values), ["1.0", "2.0", "3.0", "3.0"]) 199 | 200 | def test_multiprocess_group_apply(self): 201 | 202 | import pandas as pd 203 | from pewtils import multiprocess_group_apply 204 | 205 | df = pd.DataFrame([{"test": 1}, {"test": 2}, {"test": 3}, {"test": 3}]) 206 | df["group"] = [1, 1, 2, 2] 207 | 208 | for add, multiply, expected in [(1, 2, 6), (1, 3, 9), (2, 2, 8)]: 209 | result = multiprocess_group_apply( 210 | df.groupby("group"), _test_function_agg, add, multiply=multiply 211 | ) 212 | self.assertEqual(len(result), 2) 213 | self.assertEqual((result == expected).astype(int).sum(), 2) 214 | 215 | for add, multiply, expected in [ 216 | (1, 2, [4, 6, 8, 8]), 217 | (1, 3, [6, 9, 12, 12]), 218 | (2, 2, [6, 8, 10, 10]), 219 | ]: 220 | 221 | result = multiprocess_group_apply( 222 | df.groupby("group"), _test_function_map, add, multiply=multiply 223 | ) 224 | self.assertEqual(len(result), 4) 225 | self.assertEqual(list(result.values), expected) 226 | 227 | def test_scale_range(self): 228 | from pewtils import scale_range 229 | 230 | self.assertEqual(scale_range(10, 5, 25, 0, 10), 2.5) 231 | self.assertEqual(scale_range(5, 0, 10, 0, 20), 10.0) 232 | 233 | def test_scan_dictionary(self): 234 | from pewtils import scan_dictionary 235 | 236 | test_dict = {"one": {"two": {"three": "woot"}}} 237 | vals, paths = scan_dictionary(test_dict, "three") 238 | self.assertEqual(vals[0], "woot") 239 | self.assertEqual(paths[0], "one/two/three/") 240 | vals, paths = scan_dictionary(test_dict, "doesnt_exist") 241 | self.assertEqual(vals, []) 242 | self.assertEqual(paths, []) 243 | 244 | test_dict = { 245 | "one": { 246 | "two": {"three": "woot"}, 247 | "three": {"four": "five"}, 248 | "six": [{"three": "seven"}], 249 | } 250 | } 251 | vals, paths = scan_dictionary(test_dict, "three") 252 | self.assertEqual(len(vals), 3) 253 | self.assertEqual(len(paths), 3) 254 | self.assertIn("woot", vals) 255 | self.assertIn({"four": "five"}, vals) 256 | self.assertIn("seven", vals) 257 | self.assertIn("one/two/three/", paths) 258 | self.assertIn("one/three/", paths) 259 | self.assertIn("one/six/three/", paths) 260 | 261 | def test_new_random_number(self): 262 | from pewtils import new_random_number 263 | import numpy as np 264 | 265 | for attempt, minimum, maximum, avg in [ 266 | (1, 1, 2, 1), 267 | (1, 1, 10, 1), 268 | (2, 1, 10, 2), 269 | (3, 1, 10, 4), 270 | (4, 1, 10, 5), 271 | (5, 1, 10, 5), 272 | (1, 2, 2, 2), 273 | (1, 2, 10, 3), 274 | (2, 2, 10, 4), 275 | (3, 2, 10, 5), 276 | (4, 2, 10, 5), 277 | (5, 2, 10, 5), 278 | ]: 279 | attempts = [ 280 | new_random_number(attempt=attempt, minimum=minimum, maximum=maximum) 281 | for i in range(500) 282 | ] 283 | self.assertGreaterEqual(np.min(attempts), minimum) 284 | self.assertLessEqual(np.max(attempts), maximum) 285 | self.assertGreaterEqual(round(np.average(attempts)), avg) 286 | 287 | def test_timeout_wrapper(self): 288 | from pewtils import timeout_wrapper 289 | import time 290 | 291 | def test(sleep): 292 | try: 293 | with timeout_wrapper(2): 294 | time.sleep(sleep) 295 | return True 296 | except: 297 | return False 298 | 299 | self.assertFalse(test(3)) 300 | self.assertTrue(test(1)) 301 | 302 | def test_print_execution_time(self): 303 | 304 | import
time 306 | from io import StringIO 307 | from pewtils import PrintExecutionTime 308 | 309 | temp = StringIO() 310 | with PrintExecutionTime(label="my function", stdout=temp): 311 | time.sleep(5) 312 | temp.seek(0) 313 | output = temp.getvalue() 314 | self.assertIsNotNone(re.match(r"my function: 5\.[0-9]+ seconds", output)) 315 | 316 | def tearDown(self): 317 | pass 318 | 319 | 320 | def _test_function_agg(grp, add, multiply=1): 321 | return (len(grp) + add) * multiply 322 | 323 | 324 | def _test_function_map(grp, add, multiply=1): 325 | return grp["test"].map(lambda x: (x + add) * multiply) 326 | -------------------------------------------------------------------------------- /pewtils/io.py: -------------------------------------------------------------------------------- 1 | from builtins import object 2 | from contextlib import closing 3 | from pewtils import is_not_null 4 | from scandir import scandir 5 | import boto3 6 | import datetime 7 | import hashlib 8 | import json 9 | import os 10 | import pandas as pd 11 | import pickle as pickle 12 | import time 13 | 14 | try: 15 | from io import StringIO, BytesIO 16 | 17 | except ImportError: 18 | from StringIO import StringIO as BytesIO 19 | from StringIO import StringIO 20 | 21 | 22 | class FileHandler(object): 23 | 24 | """ 25 | Read/write data files in a variety of formats, locally and in Amazon S3 buckets. 26 | 27 | :param path: A valid path to the folder in local or s3 directory where files will be written to or read from 28 | :type path: str 29 | :param use_s3: Whether the path is an S3 location or local location 30 | :type use_s3: bool 31 | :param bucket: The name of the S3 bucket, required if ``use_s3=True``; will also try to fetch from the environment \ 32 | as S3_BUCKET 33 | :type bucket: str 34 | 35 | .. note:: Typical rectangular data files (i.e. ``csv``, ``tab``, ``xlsx``, ``xls``, ``dta`` file extension types) will be \ 36 | read to/written from a :py:class:`pandas.DataFrame` object. The exceptions are `pkl` and `json` objects which \ 37 | accept any serializable Python object and correctly-formatted JSON object respectively. 38 | 39 | .. tip:: You can configure your environment to make it easier to automatically connect to S3 by defining the \ 40 | variable ``S3_BUCKET``. 41 | 42 | Usage:: 43 | 44 | from pewtils.io import FileHandler 45 | 46 | >>> h = FileHandler("./", use_s3=False) # current local folder 47 | >>> df = h.read("my_csv", format="csv") 48 | # Do something and save to Excel 49 | >>> h.write("my_new_csv", df, format="xlsx") 50 | 51 | >>> my_data = [{"key": "value"}] 52 | >>> h.write("my_data", my_data, format="json") 53 | 54 | >>> my_data = ["a", "python", "list"] 55 | >>> h.write("my_data", my_data, format="pkl") 56 | 57 | # To read/write to an S3 bucket 58 | # The FileHandler detects your AWS tokens using boto3's standard methods to find them in ~/.aws or defined as environment variables. 
59 | >>> h = FileHandler("/my_folder", use_s3=True, bucket="my-bucket") 60 | """ 61 | 62 | def __init__(self, path, use_s3=None, bucket=None): 63 | self.bucket = os.environ.get("S3_BUCKET", None) if bucket is None else bucket 64 | self.path = path 65 | self.use_s3 = use_s3 if is_not_null(self.bucket) else False 66 | if self.use_s3: 67 | s3_params = {} 68 | self.s3 = boto3.client("s3") 69 | 70 | else: 71 | self.path = os.path.join(self.path) 72 | if not os.path.exists(self.path): 73 | try: 74 | os.makedirs(self.path) 75 | 76 | except Exception as e: 77 | print("Warning: couldn't make directory '{}'".format(self.path)) 78 | print(e) 79 | 80 | def iterate_path(self): 81 | 82 | """ 83 | Iterates over the directory and returns a list of filenames or S3 object keys 84 | 85 | :return: Yields a list of filenames or S3 keys 86 | :rtype: iterable 87 | 88 | Usage:: 89 | 90 | from pewtils.io import FileHandler 91 | 92 | >>> h = FileHandler("./", use_s3=False) 93 | >>> for file in h.iterate_path(): print(file) 94 | file1.csv 95 | file2.pkl 96 | file3.json 97 | 98 | """ 99 | 100 | if self.use_s3: 101 | for key in self.s3.list_objects(Bucket=self.bucket, Prefix=self.path).get("Contents", []): 102 | yield key["Key"] 103 | 104 | else: 105 | for f in scandir(self.path): 106 | yield f.name 107 | 108 | def clear_folder(self): 109 | """ 110 | Deletes the path (if local) or unlinks all keys in the bucket folder (if S3) 111 | 112 | .. warning:: This is a destructive function, use with caution! 113 | 114 | Usage:: 115 | 116 | from pewtils.io import FileHandler 117 | 118 | >>> h = FileHandler("./", use_s3=False) 119 | >>> len(list(h.iterate_path())) 120 | 3 121 | >>> h.clear_folder() 122 | >>> len(list(h.iterate_path())) 123 | 0 124 | 125 | """ 126 | 127 | if self.use_s3: 128 | for key in self.s3.list_objects(Bucket=self.bucket, Prefix=self.path).get("Contents", []): 129 | self.s3.delete_object(Bucket=self.bucket, Key=key["Key"]) 130 | 131 | else: 132 | for f in scandir(self.path): 133 | os.unlink(os.path.join(self.path, f.name)) 134 | 135 | def clear_file(self, key, format="pkl", hash_key=False): 136 | """ 137 | Deletes a specific file. 138 | 139 | .. warning:: This is a destructive function, use with caution! 140 | 141 | :param key: The name of the file to delete 142 | :type key: str 143 | :param format: The file extension 144 | :type format: str 145 | :param hash_key: If True, will hash the filename before looking it up; default is False. 146 | :type hash_key: bool 147 | 148 | Usage:: 149 | 150 | from pewtils.io import FileHandler 151 | 152 | >>> h = FileHandler("./", use_s3=False) 153 | >>> for file in h.iterate_path(): print(file) 154 | file1.csv 155 | file2.pkl 156 | file3.json 157 | >>> h.clear_file("file1", format="csv") 158 | >>> for file in h.iterate_path(): print(file) 159 | file2.pkl 160 | file3.json 161 | 162 | """ 163 | 164 | if hash_key: 165 | key = self.get_key_hash(key) 166 | 167 | if self.use_s3: 168 | filepath = "/".join([self.path, "{}.{}".format(key, format)]) 169 | self.s3.delete_object(Bucket=self.bucket, Key=filepath) 170 | 171 | else: 172 | key += ".{}".format(format) 173 | path = os.path.join(self.path, key) 174 | os.unlink(path) 175 | 176 | def get_key_hash(self, key): 177 | 178 | """ 179 | Converts a key to a hashed representation. Allows you to pass arbitrary objects and convert their string \ 180 | representation into a shorter hashed key, so it can be useful for caching.
You can call this method \ 181 | directly to see the hash that a key will be converted into, but this method is mainly used in conjunction \ 182 | with the :py:meth:`pewtils.io.FileHandler.write` and :py:meth:`pewtils.io.FileHandler.read` methods by passing in \ 183 | ``hash_key=True``. 184 | 185 | :param key: A raw string or Python object that can be meaningfully converted into a string representation 186 | :type key: str or object 187 | :return: A SHA224 hash representation of that key 188 | :rtype: str 189 | 190 | Usage:: 191 | 192 | from pewtils.io import FileHandler 193 | 194 | >>> h = FileHandler("tests/files", use_s3=False) 195 | >>> h.get_key_hash("temp") 196 | "c51bf90ccb22befa316b7a561fe9d5fd9650180b14421fc6d71bcd57" 197 | >>> h.get_key_hash({"key": "value"}) 198 | "37e13e1116c86a6e9f3f8926375c7cb977ca74d2d598572ced03cd09" 199 | 200 | """ 201 | 202 | try: 203 | return hashlib.sha224(key.encode("utf8")).hexdigest() 204 | except AttributeError: 205 | return hashlib.sha224(str(key).encode("utf8")).hexdigest() 206 | 207 | def write( 208 | self, key, data, format="pkl", hash_key=False, add_timestamp=False, **io_kwargs 209 | ): 210 | 211 | """ 212 | Writes arbitrary data objects to a variety of file formats. 213 | 214 | 215 | :param key: The name of the file or key (without a file suffix!) 216 | :type key: str 217 | :param data: The actual data to write to the file 218 | :type data: object 219 | :param format: The format the data should be saved in (pkl/csv/tab/xlsx/xls/dta/json). Defaults to pkl. \ 220 | This will be used as the file's suffix. 221 | :type format: str 222 | :param hash_key: Whether or not to hash the provided key before saving the file. (Default=False) 223 | :type hash_key: bool 224 | :param add_timestamp: Optionally add a timestamp to the filename 225 | :type add_timestamp: bool 226 | :param io_kwargs: Additional parameters to pass along to the Pandas save function, if applicable 227 | :return: None 228 | 229 | .. note:: When saving a ``csv``, ``tab``, ``xlsx``, ``xls``, or ``dta`` file, this function expects to receive a \ 230 | Pandas :py:class:`pandas.DataFrame`. When you use these formats, you can also pass optional ``io_kwargs`` \ 231 | which will be forwarded to the corresponding :py:mod:`pandas` method below: 232 | 233 | - `dta`: :py:meth:`pandas.DataFrame.to_stata` 234 | - `csv`: :py:meth:`pandas.DataFrame.to_csv` 235 | - `tab`: :py:meth:`pandas.DataFrame.to_csv` 236 | - `xlsx`: :py:meth:`pandas.DataFrame.to_excel` 237 | - `xls`: :py:meth:`pandas.DataFrame.to_excel` 238 | 239 | If you're trying to save an object to JSON, it assumes that you're passing it a JSON-serializable object. By default, \ 240 | the handler attempts to use pickling, allowing you to save anything you want, as long as it's serializable.
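Usage (a minimal sketch; ``df`` is assumed to be a :py:class:`pandas.DataFrame` you already have in memory, and the extra ``index`` keyword is simply forwarded to :py:meth:`pandas.DataFrame.to_csv`)::

    from pewtils.io import FileHandler

    >>> h = FileHandler("./", use_s3=False)
    # io_kwargs like index=False pass straight through to the pandas writer
    >>> h.write("my_df", df, format="csv", index=False)
    # add_timestamp=True saves to a timestamped filename, e.g. "my_df_<timestamp>.csv"
    >>> h.write("my_df", df, format="csv", add_timestamp=True)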
241 | 242 | """ 243 | 244 | format = format.strip(".") 245 | 246 | if hash_key: 247 | key = self.get_key_hash(key) 248 | 249 | if add_timestamp: 250 | key = "{}_{}".format(key, datetime.datetime.now()) 251 | 252 | def _get_output(output, data, io_kwargs): 253 | if format == "tab": 254 | io_kwargs["sep"] = "\t" 255 | if format in ["csv", "tab"]: 256 | data.to_csv(output, encoding="utf8", **io_kwargs) 257 | elif format == "dta": 258 | data.to_stata(output, **io_kwargs) 259 | elif format in ["xls", "xlsx"]: 260 | writer = pd.ExcelWriter(output, engine="xlsxwriter") 261 | data.to_excel(writer, **io_kwargs) 262 | writer.save() 263 | data = output.getvalue() 264 | return data 265 | 266 | if format in ["csv", "xls", "xlsx", "tab", "dta"]: 267 | try: 268 | data = _get_output(BytesIO(), data, io_kwargs) 269 | except Exception as e: 270 | try: 271 | data = _get_output(StringIO(), data, io_kwargs) 272 | except: 273 | raise Exception( 274 | "Couldn't convert data into '{}' format".format(format) 275 | ) 276 | 277 | elif format == "pkl": 278 | data = pickle.dumps(data, **io_kwargs) 279 | elif format == "json": 280 | data = json.dumps(data, **io_kwargs) 281 | 282 | key += ".{}".format(format) 283 | 284 | if self.use_s3: 285 | try: 286 | upload = BytesIO(data) 287 | 288 | except TypeError: 289 | upload = BytesIO(data.encode()) 290 | 291 | self.s3.upload_fileobj(upload, Bucket=self.bucket, Key="/".join([self.path, key])) 292 | 293 | else: 294 | path = os.path.join(self.path, key) 295 | if os.path.exists(self.path): 296 | try: 297 | with closing(open(path, "w")) as output: 298 | output.write(data) 299 | except: 300 | with closing(open(path, "wb")) as output: 301 | output.write(data) 302 | 303 | def read(self, key, format="pkl", hash_key=False, **io_kwargs): 304 | 305 | """ 306 | Reads a file from the directory or S3 path, returning its contents. 307 | 308 | :param key: The name of the file to read (without a suffix!) 309 | :type key: str 310 | :param format: The format of the file (pkl/json/csv/dta/xls/xlsx/tab); expects the file extension to match 311 | :type format: str 312 | :param hash_key: Whether the key should be hashed prior to looking for and retrieving the file. 313 | :type hash_key: bool 314 | :param io_kwargs: Optional arguments to be passed to the specific load function (dependent on file format) 315 | :return: The file contents, in the requested format 316 | 317 | .. 
note:: You can pass optional ``io_kwargs`` that will be forwarded to the function below that corresponds to \ 318 | the format of the file you're trying to read in: 319 | 320 | - `dta`: :py:func:`pandas.read_stata` 321 | - `csv`: :py:func:`pandas.read_csv` 322 | - `tab`: :py:func:`pandas.read_csv` 323 | - `xlsx`: :py:func:`pandas.read_excel` 324 | - `xls`: :py:func:`pandas.read_excel` 325 | """ 326 | 327 | format = format.strip(".") 328 | 329 | if hash_key: 330 | key = self.get_key_hash(key) 331 | 332 | data = None 333 | filepath = "/".join([self.path, "{}.{}".format(key, format)]) 334 | 335 | if self.use_s3: 336 | # boto3 writes raw bytes into the buffer, so use BytesIO rather than StringIO 337 | data = BytesIO() 338 | 339 | # download_fileobj expects the bucket and key, plus a writable file object 340 | self.s3.download_fileobj( 341 | Fileobj=data, Bucket=self.bucket, Key=filepath 342 | ) 343 | data = data.getvalue() 344 | else: 345 | if os.path.exists(filepath): 346 | try: 347 | with closing(open(filepath, "r")) as infile: 348 | data = infile.read() 349 | 350 | except: 351 | # TODO: handle this exception more explicitly 352 | with closing(open(filepath, "rb")) as infile: 353 | data = infile.read() 354 | 355 | if is_not_null(data): 356 | if format == "pkl": 357 | try: 358 | data = pickle.loads(data) 359 | 360 | except TypeError: 361 | data = None 362 | 363 | except ValueError: 364 | if "attempt_count" not in io_kwargs: 365 | io_kwargs["attempt_count"] = 1 366 | 367 | print( 368 | "Insecure pickle string; probably a concurrent read-write, \ 369 | will try again in 5 seconds (attempt #{})".format( 370 | io_kwargs["attempt_count"] 371 | ) 372 | ) 373 | time.sleep(5) 374 | 375 | if io_kwargs["attempt_count"] <= 3: 376 | io_kwargs["attempt_count"] += 1 377 | data = self.read( 378 | key, format=format, hash_key=hash_key, **io_kwargs 379 | ) 380 | 381 | else: 382 | data = None 383 | 384 | except Exception as e: 385 | print("Couldn't load pickle! {}".format(e)) 386 | data = None 387 | 388 | elif format in ["tab", "csv"]: 389 | if format == "tab": 390 | io_kwargs["delimiter"] = "\t" 391 | 392 | try: 393 | data = pd.read_csv(BytesIO(data), **io_kwargs) 394 | 395 | except: 396 | data = pd.read_csv(StringIO(data), **io_kwargs) 397 | 398 | elif format in ["xlsx", "xls"]: 399 | # https://stackoverflow.com/questions/64264563/attributeerror-elementtree-object-has-no-attribute-getiterator-when-trying 400 | if "engine" not in io_kwargs: 401 | io_kwargs["engine"] = "openpyxl" 402 | 403 | try: 404 | data = pd.read_excel(BytesIO(data), **io_kwargs) 405 | 406 | except: 407 | data = pd.read_excel(StringIO(data), **io_kwargs) 408 | 409 | elif format == "json": 410 | try: 411 | data = json.loads(data) 412 | 413 | except: 414 | pass 415 | 416 | elif format == "dta": 417 | try: 418 | data = pd.read_stata(BytesIO(data), **io_kwargs) 419 | 420 | except: 421 | data = pd.read_stata(StringIO(data), **io_kwargs) 422 | 423 | elif format == "txt": 424 | if isinstance(data, bytes): 425 | data = data.decode() 426 | 427 | return data 428 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed.
8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 
71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 
128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. 
Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. 
If the Program
243 | specifies a version number of this License which applies to it and "any
244 | later version", you have the option of following the terms and conditions
245 | either of that version or of any later version published by the Free
246 | Software Foundation. If the Program does not specify a version number of
247 | this License, you may choose any version ever published by the Free Software
248 | Foundation.
249 | 
250 | 10. If you wish to incorporate parts of the Program into other free
251 | programs whose distribution conditions are different, write to the author
252 | to ask for permission. For software which is copyrighted by the Free
253 | Software Foundation, write to the Free Software Foundation; we sometimes
254 | make exceptions for this. Our decision will be guided by the two goals
255 | of preserving the free status of all derivatives of our free software and
256 | of promoting the sharing and reuse of software generally.
257 | 
258 | NO WARRANTY
259 | 
260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268 | REPAIR OR CORRECTION.
269 | 
270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278 | POSSIBILITY OF SUCH DAMAGES.
279 | 
280 | END OF TERMS AND CONDITIONS
281 | 
282 | How to Apply These Terms to Your New Programs
283 | 
284 | If you develop a new program, and you want it to be of the greatest
285 | possible use to the public, the best way to achieve this is to make it
286 | free software which everyone can redistribute and change under these terms.
287 | 
288 | To do so, attach the following notices to the program. It is safest
289 | to attach them to the start of each source file to most effectively
290 | convey the exclusion of warranty; and each file should have at least
291 | the "copyright" line and a pointer to where the full notice is found.
292 | 
293 | <one line to give the program's name and a brief idea of what it does.>
294 | Copyright (C) <year> <name of author>
295 | 
296 | This program is free software; you can redistribute it and/or modify
297 | it under the terms of the GNU General Public License as published by
298 | the Free Software Foundation; either version 2 of the License, or
299 | (at your option) any later version.
300 | 
301 | This program is distributed in the hope that it will be useful,
302 | but WITHOUT ANY WARRANTY; without even the implied warranty of
303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
304 | GNU General Public License for more details.
305 | 
306 | You should have received a copy of the GNU General Public License along
307 | with this program; if not, write to the Free Software Foundation, Inc.,
308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
309 | 
310 | Also add information on how to contact you by electronic and paper mail.
311 | 
312 | If the program is interactive, make it output a short notice like this
313 | when it starts in an interactive mode:
314 | 
315 | Gnomovision version 69, Copyright (C) year name of author
316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
317 | This is free software, and you are welcome to redistribute it
318 | under certain conditions; type `show c' for details.
319 | 
320 | The hypothetical commands `show w' and `show c' should show the appropriate
321 | parts of the General Public License. Of course, the commands you use may
322 | be called something other than `show w' and `show c'; they could even be
323 | mouse-clicks or menu items--whatever suits your program.
324 | 
325 | You should also get your employer (if you work as a programmer) or your
326 | school, if any, to sign a "copyright disclaimer" for the program, if
327 | necessary. Here is a sample; alter the names:
328 | 
329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program
330 | `Gnomovision' (which makes passes at compilers) written by James Hacker.
331 | 
332 | <signature of Ty Coon>, 1 April 1989
333 | Ty Coon, President of Vice
334 | 
335 | This General Public License does not permit incorporating your program into
336 | proprietary programs. If your program is a subroutine library, you may
337 | consider it more useful to permit linking proprietary applications with the
338 | library. If this is what you want to do, use the GNU Lesser General
339 | Public License instead of this License.
--------------------------------------------------------------------------------
/pewtils/http.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | from bs4 import BeautifulSoup
3 | from builtins import str
4 | from pewtils import get_hash, decode_text, is_not_null
5 | from six.moves.urllib import parse as urlparse
6 | from unidecode import unidecode
7 | import pandas as pd
8 | import re
9 | import os
10 | import requests
11 | import tldextract
12 | import warnings
13 | from requests.exceptions import ReadTimeout
14 | from stopit import ThreadingTimeout as Timeout
15 | 
16 | 
17 | _ = pd.read_csv(
18 |     os.path.join(
19 |         os.path.dirname(os.path.abspath(__file__)), "general_link_shorteners.csv"
20 |     )
21 | )
22 | GENERAL_LINK_SHORTENERS = _["shortener"].values
23 | 
24 | 
25 | _ = pd.read_csv(
26 |     os.path.join(
27 |         os.path.dirname(os.path.abspath(__file__)), "vanity_link_shorteners.csv"
28 |     )
29 | )
30 | _ = _[_["historical"] == 0]
31 | VANITY_LINK_SHORTENERS = dict(zip(_["shortener"], _["expanded"]))
32 | 
33 | _ = pd.read_csv(
34 |     os.path.join(
35 |         os.path.dirname(os.path.abspath(__file__)), "vanity_link_shorteners.csv"
36 |     )
37 | )
38 | _ = _[_["historical"] == 1]
39 | HISTORICAL_VANITY_LINK_SHORTENERS = dict(zip(_["shortener"], _["expanded"]))
40 | 
41 | VANITY_LINK_SHORTENERS.update(HISTORICAL_VANITY_LINK_SHORTENERS)
42 | 
43 | 
44 | def hash_url(url):
45 | 
46 |     """
47 |     Clears out the http/https prefix and returns an MD5 hash of the URL. More effective \
48 |     when used in conjunction with :py:func:`pewtils.http.canonical_link`.
49 | 
50 |     :param url: The URL to hash
51 |     :type url: str
52 |     :return: Hashed string representation of the URL using the md5 hashing algorithm.
53 |     :rtype: str
54 | 
55 |     Usage::
56 | 
57 |         from pewtils.http import hash_url
58 | 
59 |         >>> hash_url("http://www.example.com")
60 |         "7c1767b30512b6003fd3c2e618a86522"
61 |         >>> hash_url("www.example.com")
62 |         "7c1767b30512b6003fd3c2e618a86522"
63 | 
64 |     """
65 | 
66 |     http_regex = re.compile(r"^http(s)?\:\/\/")
67 |     with warnings.catch_warnings():
68 |         warnings.simplefilter("ignore")
69 |         result = get_hash(
70 |             unidecode(http_regex.sub("", url.lower())), hash_function="md5"
71 |         )
72 |     return result
73 | 
74 | 
75 | def strip_html(html, simple=False, break_tags=None):
76 | 
77 |     """
78 |     Attempts to strip out HTML code from an arbitrary string while preserving meaningful text components. \
79 |     By default, the function will use BeautifulSoup to parse the HTML. Setting ``simple=True`` will make the \
80 |     function use a much simpler regular expression approach to parsing.
81 | 
82 |     :param html: The HTML to process
83 |     :type html: str
84 |     :param simple: Whether or not to use a simple regex or more complex parsing rules (default=False)
85 |     :type simple: bool
86 |     :param break_tags: A custom list of tags on which to break (default is ["strong", "em", "i", "b", "p"])
87 |     :type break_tags: list
88 |     :return: The text with HTML components removed
89 |     :rtype: str
90 | 
91 |     .. note:: This function might not be effective for *all* variations of HTML structures, but it produces fairly \
92 |     reliable results in removing the vast majority of HTML without stripping out valuable content.
93 | 
94 |     Usage::
95 | 
96 |         from pewtils.http import strip_html
97 | 
98 |         >>> my_html = "<html><head>Header text</head><body>Body text</body></html>"
99 |         >>> strip_html(my_html)
100 |         'Header text Body text'
101 | 
102 |     """
103 | 
104 |     html = re.sub(r"\n", " ", html)
105 |     html = re.sub(r"\s+", " ", html)
106 |     if not break_tags:
107 |         break_tags = ["strong", "em", "i", "b", "p"]
108 |     if not simple:
109 |         try:
110 | 
111 |             split_re = re.compile(r"\s{2,}")
112 |             soup = BeautifulSoup(html, "lxml")
113 |             for tag in soup():
114 |                 if (
115 |                     "class" in tag.attrs
116 |                     and ("menu" in tag.attrs["class"] or "header" in tag.attrs["class"])
117 |                 ) or ("menu" in str(tag.id) or "header" in str(tag.id)):
118 |                     tag.extract()
119 |             for tag in soup(["script", "style"]):
120 |                 tag.extract()
121 |             for br in soup.find_all("br"):
122 |                 br.replace_with("\n")
123 |             for t in soup(break_tags):
124 |                 try:
125 |                     t.replace_with("\n{0}\n".format(t.text))
126 |                 except (UnicodeDecodeError, UnicodeEncodeError):
127 |                     t.replace_with("\n{0}\n".format(decode_text(t.text)))
128 |             if hasattr(soup, "body") and soup.body:
129 |                 text = soup.body.get_text()
130 |             else:
131 |                 text = soup.get_text()
132 |             lines = [l.strip() for l in text.splitlines()]
133 |             lines = [l2.strip() for l in lines for l2 in split_re.split(l)]
134 |             text = "\n".join([l for l in lines if l])
135 |             text = re.sub(r"(\sA){2,}\s", " ", text)
136 |             text = re.sub(r"\n+(\s+)?", "\n\n", text)
137 |             text = re.sub(r" +", " ", text)
138 |             text = re.sub(r"\t+", " ", text)
139 | 
140 |             return text
141 | 
142 |         except Exception as e:
143 | 
144 |             print("strip_html error")
145 |             print(e)
146 |             text = re.sub(r"<[^>]*>", " ", re.sub("\\s+", " ", html)).strip()
147 |             return text
148 | 
149 |     else:
150 |         return "\n".join(
151 |             [
152 |                 re.sub(r"\s+", " ", re.sub(r"\<[^\>]+\>", " ", section))
153 |                 for section in re.sub(r"\<\/?div\>|\<\/?p\>|\<br\>", "\n", html).split(
154 |                     "\n"
155 |                 )
156 |             ]
157 |         )
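# Editor's note: a quick illustrative sketch of strip_html (not part of the
# original module). With the default parser, break tags such as <p> become
# line breaks and <script>/<style> contents are dropped entirely:
#
#     >>> strip_html("<html><body><p>Hello</p><script>var x;</script></body></html>")
#     'Hello'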
158 | 159 | 160 | def trim_get_parameters(url, session=None, timeout=30, user_agent=None): 161 | 162 | """ 163 | Takes a URL (presumed to be the final end point) and iterates over GET parameters, attempting to find optional 164 | ones that can be removed without generating any redirects. 165 | 166 | :param url: The URL to trim 167 | :type url: str 168 | :param session: (Optional) A persistent session that can optionally be passed (useful if you're processing many \ 169 | links at once) 170 | :type session: :py:class:`requests.Session` object 171 | :param user_agent: User agent for the auto-created requests Session to use, if a preconfigured requests Session \ 172 | is not provided 173 | :type user_agent: str 174 | :param timeout: Timeout for requests 175 | :type timeout: int or float 176 | :return: The original URL with optional GET parameters removed 177 | :rtype: str 178 | 179 | Usage:: 180 | 181 | from pewtils.http import trim_get_parameters 182 | 183 | >>> trim_get_parameters("https://httpbin.org/status/200?param=1") 184 | "https://httpbin.org/status/200" 185 | 186 | """ 187 | 188 | close_session = False 189 | if not session: 190 | close_session = True 191 | session = requests.Session() 192 | session.headers.update({"User-Agent": user_agent}) 193 | 194 | # Often there's extra information about social sharing and referral sources that can be removed 195 | ditch_params = [] 196 | parsed = urlparse.urlparse(url) 197 | if parsed.query: 198 | params = urlparse.parse_qs(parsed.query) 199 | for k, v in params.items(): 200 | # We iterate over all of the GET parameters and try holding each one out 201 | check = True 202 | for skipper in ["document", "article", "id", "qs"]: 203 | # If the parameter is named something that's probably a unique ID, we'll keep it 204 | if skipper in k.lower(): 205 | check = False 206 | for skipper in ["html", "http"]: 207 | # Same goes for parameters that contain URL information 208 | if skipper in v[0].lower(): 209 | check = False 210 | if check: 211 | new_params = { 212 | k2: v2[0] for k2, v2 in params.items() if k2 != k and len(v2) == 1 213 | } 214 | new_params = urlparse.urlencode(new_params) 215 | new_parsed = parsed._replace(query=new_params) 216 | new_url = urlparse.urlunparse(new_parsed) 217 | try: 218 | resp = session.head(new_url, allow_redirects=True, timeout=timeout) 219 | except ReadTimeout: 220 | resp = None 221 | if is_not_null(resp): 222 | new_parsed = urlparse.urlparse(resp.url) 223 | if new_parsed.query != "" or new_parsed.path not in ["", "/"]: 224 | # If removing a parameter didn't redirect to a root domain... 225 | new_url = resp.url 226 | compare_new = ( 227 | new_url.split("?")[0] if "?" in new_url else new_url 228 | ) 229 | compare_old = url.split("?")[0] if "?" 
in url else url 230 | if compare_new == compare_old: 231 | # And the domain is the same as it was before, then the parameter was probably unnecessary 232 | ditch_params.append(k) 233 | 234 | if len(ditch_params) > 0: 235 | # Now we remove all of the unnecessary get parameters and finalize the URL 236 | new_params = { 237 | k: v[0] for k, v in params.items() if len(v) == 1 and k not in ditch_params 238 | } 239 | new_params = urlparse.urlencode(new_params) 240 | parsed = parsed._replace(query=new_params) 241 | url = urlparse.urlunparse(parsed) 242 | 243 | if close_session: 244 | session.close() 245 | 246 | return url 247 | 248 | 249 | def extract_domain_from_url( 250 | url, 251 | include_subdomain=True, 252 | resolve_url=False, 253 | timeout=1.0, 254 | session=None, 255 | user_agent=None, 256 | expand_shorteners=True, 257 | ): 258 | 259 | """ 260 | Attempts to extract a standardized domain from a url by following the link and extracting the TLD. 261 | 262 | :param url: The link from which to extract the domain 263 | :type url: str 264 | :param include_subdomain: Whether or not to include the subdomain (e.g. 'news.google.com'); default is True 265 | :type include_subdomain: bool 266 | :param resolve_url: Whether to fully resolve the URL. If False (default), it will operate on the URL as-is; if \ 267 | True, the URL will be passed to :py:func:`pewtils.http.canonical_link` to be standardized prior to extracting the \ 268 | domain. 269 | :param timeout: (Optional, for use with ``resolve_url``) Maximum number of seconds to wait on a request before \ 270 | timing out (default is 1) 271 | :type timeout: int or float 272 | :param session: (Optional, for use with ``resolve_url``) A persistent session that can optionally be passed \ 273 | (useful if you're processing many links at once) 274 | :type session: :py:class:`requests.Session` object 275 | :param user_agent: (Optional, for use with ``resolve_url``) User agent for the auto-created requests Session to use, \ 276 | if a preconfigured requests Session is not provided 277 | :type user_agent: str 278 | :param expand_shorteners: If True, shortened URLs that don't successfully expand will be checked against a list \ 279 | of known URL shorteners and expanded if recognized. (Default = True) 280 | :type expand_shorteners: bool 281 | :return: The domain for the link 282 | :rtype: str 283 | 284 | .. note:: If ``resolve_url`` is set to True, the link will be standardized prior to domain extraction (in which \ 285 | case you can provide optional timeout, session, and user_agent parameters that will be passed to \ 286 | :py:func:`pewtils.http.canonical_link`). By default, however, the link will be operated on as-is. The final \ 287 | extracted domain is then checked against known URL shorteners (see :ref:`vanity_link_shorteners`) and if it \ 288 | is recognized, the expanded domain will be returned instead. Shortened URLs that are not standardized and \ 289 | do not follow patterns included in this dictionary of known shorteners may be returned with an incorrect domain. 
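    .. note:: As a hedged illustration (the authoritative mappings live in ``vanity_link_shorteners.csv``), \
        with ``expand_shorteners=True`` a recognized vanity shortener domain such as ``pewrsr.ch`` would be \
        replaced by its expanded counterpart, while ordinary domains pass through unchanged.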
290 | 
291 |     Usage::
292 | 
293 |         from pewtils.http import extract_domain_from_url
294 | 
295 |         >>> extract_domain_from_url("http://forums.bbc.co.uk", include_subdomain=False)
296 |         "bbc.co.uk"
297 |         >>> extract_domain_from_url("http://forums.bbc.co.uk", include_subdomain=True)
298 |         "forums.bbc.co.uk"
299 | 
300 |     """
301 | 
302 |     if resolve_url:
303 |         url = canonical_link(
304 |             url, timeout=timeout, session=session, user_agent=user_agent
305 |         )
306 |     domain = tldextract.extract(url)
307 |     if domain:
308 |         if include_subdomain and domain.subdomain and domain.subdomain != "www":
309 |             domain = ".".join([domain.subdomain, domain.domain, domain.suffix])
310 |         else:
311 |             domain = ".".join([domain.domain, domain.suffix])
312 |         if expand_shorteners:
313 |             domain = VANITY_LINK_SHORTENERS.get(domain, domain)
314 |     return domain
315 | 
316 | 
317 | def canonical_link(url, timeout=5.0, session=None, user_agent=None):
318 | 
319 |     """
320 |     Tries to resolve a link to the "most correct" version.
321 | 
322 |     Useful for expanding short URLs from bit.ly / Twitter and for checking HTTP status codes without retrieving \
323 |     the actual data. Follows redirects and tries to pick the most informative version of a URL while avoiding \
324 |     redirects to generic 404 pages. Also tries to iteratively remove optional GET parameters.
325 | 
326 |     May not be particularly effective on dead links, but may still be able to follow redirects enough \
327 |     to return a URL with the correct domain associated with the original link.
328 | 
329 |     :param url: The URL to test. Should be fully qualified.
330 |     :type url: str
331 |     :param timeout: How long to wait for a response before giving up (default is 5 seconds)
332 |     :type timeout: int or float
333 |     :param session: (Optional) A persistent session that can optionally be passed (useful if you're processing many \
334 |     links at once)
335 |     :type session: :py:class:`requests.Session` object
336 |     :param user_agent: User agent for the auto-created requests Session to use, if a preconfigured requests Session \
337 |     is not provided
338 |     :type user_agent: str
339 |     :return: The "canonical" URL as supplied by the server, or the original URL if none supplied.
340 |     :rtype: str
341 | 
342 |     .. note:: See :ref:`link_shorteners` for a complete list of shortened links recognized by this function.
343 | 
344 |     This function might not resolve *all* existing URL modifications, but it has been tested on a large and \
345 |     varied collection of URLs. It typically resolves a URL to the correct final page while avoiding redirects to \
346 |     generic error pages.
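    .. note:: Network errors are swallowed rather than raised, so the worst-case behavior is simply returning \
        the input unchanged; e.g. ``canonical_link("http://a.completely.dead.example")`` (a hypothetical dead \
        host, for illustration) would come back as-is.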
347 | 
348 |     Usage::
349 | 
350 |         from pewtils.http import canonical_link
351 | 
352 |         >>> canonical_link("https://pewrsr.ch/2lxB0EX")
353 |         "https://www.pewresearch.org/interactives/how-does-a-computer-see-gender/"
354 | 
355 |     """
356 | 
357 |     BAD_STATUS_CODES = [
358 |         302,
359 |         307,
360 |         400,
361 |         404,
362 |         405,
363 |         407,
364 |         500,
365 |         501,
366 |         502,
367 |         503,
368 |         504,
369 |         520,
370 |         530,
371 |     ]
372 | 
373 |     PROXY_REQUIRED = [307, 407]
374 |     CHECK_LENGTH = [301, 302, 200, 404]
375 | 
376 |     close_session = False
377 |     if not session:
378 |         close_session = True
379 |         session = requests.Session()
380 |         session.headers.update({"User-Agent": user_agent})
381 |     if not url.startswith("http"):
382 |         url = "http://" + url
383 |     response = None
384 |     try:
385 |         with Timeout(timeout):
386 |             try:
387 |                 response = session.head(url, allow_redirects=True, timeout=timeout)
388 |             except requests.ConnectionError:
389 |                 try:
390 |                     response = session.head(url, allow_redirects=False, timeout=timeout)
391 |                 except:
392 |                     pass
393 |     except:
394 |         pass
395 | 
396 |     if response:
397 | 
398 |         history = [(h.status_code, h.url) for h in response.history]
399 |         history.append((response.status_code, response.url))
400 | 
401 |         last_good_url = history[0][1]
402 |         original_parsed = urlparse.urlparse(last_good_url)
403 |         has_path = original_parsed.path not in ["/", ""]
404 |         has_query = original_parsed.query != ""
405 |         prev_was_shortener = False
406 |         prev_path = None
407 |         prev_query = None
408 |         status_code = None
409 |         for i, resp in enumerate(history):
410 |             status_code, response_url = resp
411 |             if "errors/404" in response_url:
412 |                 # If it's clearly a 404 landing page, stop and use the last observed good URL
413 |                 break
414 |             parsed = urlparse.urlparse(response_url)
415 |             if (
416 |                 parsed.netloc in VANITY_LINK_SHORTENERS.keys()
417 |                 or parsed.netloc in GENERAL_LINK_SHORTENERS
418 |             ):
419 |                 # Don't consider known shortened URLs
420 |                 is_shortener = True
421 |             else:
422 |                 is_shortener = False
423 |             if not is_shortener:
424 |                 if i != 0:
425 |                     for param, val in urlparse.parse_qs(parsed.query).items():
426 |                         if len(val) == 1 and val[0].startswith("http"):
427 |                             parsed_possible_url = urlparse.urlparse(val[0])
428 |                             if (
429 |                                 parsed_possible_url.scheme
430 |                                 and parsed_possible_url.netloc
431 |                             ):
432 |                                 # If the URL contains a GET parameter that is, itself, a URL, it's likely redirecting
433 |                                 # to it, so we're going to stop this run and start the process over with the new URL
434 |                                 return canonical_link(
435 |                                     val[0],
436 |                                     timeout=timeout,
437 |                                     session=session,
438 |                                     user_agent=user_agent,
439 |                                 )
440 |                 if status_code in PROXY_REQUIRED:
441 |                     # These codes tend to indicate the last good URL in the chain
442 |                     last_good_url = response_url
443 |                     break
444 |                 good_path = not has_path or parsed.path not in ["/", ""]
445 |                 good_query = not has_query or parsed.query != ""
446 |                 # If the URL has a path or some GET parameters, we'll inspect further
47 |                 # Otherwise we just go with the previous URL
448 |                 # Link shorteners are very rarely used to reference root domains
449 |                 if good_query or good_path:
450 |                     if (
451 |                         re.sub("https", "http", response_url)
452 |                         == re.sub("https", "http", last_good_url)
453 |                         or parsed.path == original_parsed.path
454 |                     ) or response_url.lower() == last_good_url.lower():
455 |                         # If it's the same link but only the domain, protocol, or casing changed, it's fine
456 |                         last_good_url = response_url
457 |                     elif i != 0 and status_code in CHECK_LENGTH:
458 |                         # For these codes, we're going to see how much the link changed
459 |                         # Redirects and 404s sometimes preserve a decent URL, sometimes they go to a landing page
460 |                         # The following cutoffs seem to do a good job most of the time:
461 |                         # 1) The new URL has a long domain more than 7 characters, so it's not likely a shortened URL
462 |                         # 2) The prior URL had a long path and this one has fewer than 20 characters and it wasn't
463 |                         #    swapped out for GET params
464 |                         # 3) Or the prior URL had GET params and this one has far fewer and no replacement path
465 |                         # If these conditions are met and the path or query do not identically match the prior link
466 |                         # Then it's usually a generic error page
467 |                         bad = False
468 |                         if (
469 |                             has_path
470 |                             and len(parsed.netloc) > 7
471 |                             and len(parsed.path) < 20
472 |                             and len(parsed.query) == 0
473 |                             and prev_path != parsed.path
474 |                         ) or (
475 |                             has_query
476 |                             and len(parsed.netloc) > 7
477 |                             and len(parsed.query) < 20
478 |                             and len(parsed.path) <= 1
479 |                             and prev_query != parsed.query
480 |                         ):
481 |                             bad = True
482 |                         if not bad or prev_was_shortener:
483 |                             last_good_url = response_url
484 |                             # print("GOOD: {}, {}".format(status_code, response_url))
485 |                         else:
486 |                             # These can sometimes resolve further though, so we continue onward
487 |                             prev_path = None
488 |                             prev_query = None
489 |                     else:
490 |                         if status_code not in BAD_STATUS_CODES:
491 |                             last_good_url = response_url
492 |                         else:
493 |                             break
494 |             else:
495 |                 # Resolved to a general URL
496 |                 break
497 | 
498 |             prev_was_shortener = is_shortener
499 |             prev_path = parsed.path
500 |             prev_query = parsed.query
501 | 
502 |         if status_code not in BAD_STATUS_CODES:
503 |             # If the URL ended on a good status code, we'll try to trim out any unnecessary GET parameters
504 |             last_good_url = trim_get_parameters(
505 |                 last_good_url, session=session, timeout=timeout, user_agent=user_agent
506 |             )
507 | 
508 |         url = last_good_url
509 | 
510 |     if close_session:
511 |         session.close()
512 | 
513 |     return url
514 | 
--------------------------------------------------------------------------------
/pewtils/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | import chardet
3 | import copy
4 | import json
5 | import imp
6 | import multiprocessing
7 | import os
8 | import re
9 | import signal
10 | import sys
11 | import time
12 | import warnings
13 | import zipcodes
14 | 
15 | try:
16 |     from importlib.machinery import SourceFileLoader
17 | except ImportError:
18 |     import imp
19 | 
20 | import pandas as pd
21 | import numpy as np
22 | 
23 | from contextlib import closing
24 | from hashlib import md5
25 | from random import uniform
26 | from scandir import walk
27 | from unidecode import unidecode
28 | 
29 | 
30 | class classproperty(object):
31 | 
32 |     """
33 |     This decorator allows you to define functions on a class that are accessible directly from the
34 |     class itself (rather than an instance of the class). It allows you to access ``classproperty``
35 |     attributes directly, such as ``obj.property``, rather than as a function on a class instance
36 |     (like ``obj = Obj(); obj.property()``).
37 | 
38 |     Borrowed from a StackOverflow post.
39 | 
40 |     Usage::
41 | 
42 |         from pewtils import classproperty
43 | 
44 |         class MyClass(object):
45 |             x = 4
46 | 
47 |             @classproperty
48 |             def number(cls):
49 |                 return cls.x
50 | 
51 |         >>> MyClass().number
52 |         4
53 |         >>> MyClass.number
54 |         4
55 |     """
56 | 
57 |     def __init__(self, fget):
58 |         self.fget = fget
59 | 
60 |     def __get__(self, owner_self, owner_cls):
61 |         return self.fget(owner_cls)
62 | 
63 | 
64 | def is_not_null(val, empty_lists_are_null=False, custom_nulls=None):
65 | 
66 |     """
67 |     Checks whether the value is null, using a variety of potential string values, etc. The following values are always
68 |     considered null: ``numpy.nan, None, "None", "nan", "", " ", "NaN", "none", "n/a", "NONE", "N/A"``
69 | 
70 |     :param val: The value to check
71 |     :param empty_lists_are_null: Whether or not an empty list or :py:class:`pandas.DataFrame` should be considered \
72 |     null (default=False)
73 |     :type empty_lists_are_null: bool
74 |     :param custom_nulls: an optional list of additional values to consider as null
75 |     :type custom_nulls: list
76 |     :return: True if the value is not null
77 |     :rtype: bool
78 | 
79 |     Usage::
80 | 
81 |         from pewtils import is_not_null
82 | 
83 |         >>> text = "Hello"
84 |         >>> is_not_null(text)
85 |         True
86 |     """
87 | 
88 |     null_values = [None, "None", "nan", "", " ", "NaN", "none", "n/a", "NONE", "N/A"]
89 |     if custom_nulls:
90 |         null_values.extend(custom_nulls)
91 |     if type(val) == list:
92 |         if empty_lists_are_null and val == []:
93 |             return False
94 |         else:
95 |             return True
96 |     elif isinstance(val, pd.Series) or isinstance(val, pd.DataFrame):
97 |         if empty_lists_are_null and len(val) == 0:
98 |             return False
99 |         else:
100 |             return True
101 |     else:
102 |         try:
103 |             try:
104 |                 good = val not in null_values
105 |                 if good:
106 |                     try:
107 |                         try:
108 |                             good = not pd.isnull(val)
109 |                         except IndexError:
110 |                             good = True
111 |                     except AttributeError:
112 |                         good = True
113 |                 return good
114 |             except ValueError:
115 |                 return val.any()
116 |         except TypeError:
117 |             return val is not None  # the original ``isinstance(val, None)`` would itself raise a TypeError
118 | 
119 | 
120 | def is_null(val, empty_lists_are_null=False, custom_nulls=None):
121 | 
122 |     """
123 |     Returns the opposite of the outcome of :py:func:`pewtils.is_not_null`. The following values are always \
124 |     considered null: ``numpy.nan, None, "None", "nan", "", " ", "NaN", "none", "n/a", "NONE", "N/A"``
125 | 
126 |     :param val: The value to check
127 |     :param empty_lists_are_null: Whether or not an empty list or :py:class:`pandas.DataFrame` should be considered \
128 |     null (default=False)
129 |     :type empty_lists_are_null: bool
130 |     :param custom_nulls: an optional list of additional values to consider as null
131 |     :type custom_nulls: list
132 |     :return: True if the value is null
133 |     :rtype: bool
134 | 
135 |     Usage::
136 | 
137 |         from pewtils import is_null
138 | 
139 |         >>> empty_list = []
140 |         >>> is_null(empty_list, empty_lists_are_null=True)
141 |         True
142 |     """
143 | 
144 |     return not is_not_null(
145 |         val, empty_lists_are_null=empty_lists_are_null, custom_nulls=custom_nulls
146 |     )
147 | 
148 | 
149 | def decode_text(text, throw_loud_fail=False):
150 | 
151 |     """
152 |     Attempts to decode and re-encode text as ASCII. In the case of failure, it will attempt to detect the string's \
153 |     encoding, decode it, and convert it to ASCII. If both these attempts fail, it will attempt to use the \
154 |     :py:mod:`unidecode` package to transliterate into ASCII. And finally, if that doesn't work, it will forcibly \
155 |     encode the text as ASCII and ignore non-ASCII characters.
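    A minimal sketch of the Python 3 behavior, where the final fallback is :py:mod:`unidecode` transliteration:

        >>> decode_text("áéíóú")
        'aeiou'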
156 | 157 | .. warning:: This function is potentially destructive to source input and should be used with some care. \ 158 | Input text that cannot be decoded may be stripped out, or replaced with a similar ASCII character or other \ 159 | placeholder, potentially resulting in an empty string. 160 | 161 | :param text: The text to process 162 | :type text: str 163 | :param throw_loud_fail: If True, exceptions will be raised, otherwise the function will fail silently and \ 164 | return an empty string (default False) 165 | :type throw_loud_fail: bool 166 | :return: Decoded text, or empty string 167 | :rtype: str 168 | 169 | .. note:: In Python 3, the decode/encode attempts will fail by default, and the :py:mod:`unidecode` package will \ 170 | be used to transliterate. In general, you shouldn't need to use this function in Python 3, but it shouldn't \ 171 | hurt anything if you do. 172 | 173 | """ 174 | 175 | output_text = "" 176 | with warnings.catch_warnings(): 177 | warnings.simplefilter("ignore") 178 | if is_not_null(text): 179 | try: 180 | text = u"{}".format(text) 181 | output_text = text.decode("ascii").encode("ascii") 182 | except (AttributeError, TypeError, UnicodeEncodeError, UnicodeDecodeError): 183 | try: 184 | output_text = text.decode(chardet.detect(text)["encoding"]) 185 | output_text = output_text.encode("ascii") 186 | except ( 187 | AttributeError, 188 | TypeError, 189 | UnicodeEncodeError, 190 | UnicodeDecodeError, 191 | ): 192 | try: 193 | output_text = unidecode(text) 194 | except ( 195 | AttributeError, 196 | TypeError, 197 | UnicodeEncodeError, 198 | UnicodeDecodeError, 199 | ): 200 | if throw_loud_fail: 201 | output_text = text.decode("ascii", "ignore").encode("ascii") 202 | else: 203 | try: 204 | output_text = text.decode("ascii", "ignore").encode( 205 | "ascii" 206 | ) 207 | except ( 208 | AttributeError, 209 | TypeError, 210 | UnicodeEncodeError, 211 | UnicodeDecodeError, 212 | ): 213 | print("Could not decode") 214 | print(text) 215 | output_text = output_text.replace("\x00", "").replace("\u0000", "") 216 | 217 | return output_text 218 | 219 | 220 | def get_hash(text, hash_function="ssdeep"): 221 | 222 | """ 223 | Generates hashed text using one of several available hashing functions. 224 | 225 | :param text: The string to hash 226 | :type text: str 227 | :param hash_function: The specific algorithm to use; options are ``'nilsimsa'``, ``'md5'``, and ``'ssdeep'`` \ 228 | (default) 229 | :type hash_function: str 230 | :return: A hashed representation of the provided string 231 | :rtype: str 232 | 233 | .. note:: The string will be passed through :py:func:`pewtils.decode_text` and the returned value will be used \ 234 | instead of the original value if it runs successfully, in order to ensure consistent hashing in both Python 2 and \ 235 | 3. By default the function uses the :py:mod:`ssdeep` algorithm, which generates context-sensitive hashes that are \ 236 | useful for computing document similarities at scale. 237 | 238 | .. note:: Using `hash_function='ssdeep'` requires the :py:mod:`ssdeep` library, which is not installed by default \ 239 | because it requires the installation of additional system libraries on certain operating systems. For help \ 240 | installing ssdeep, refer to the pewtils documentation installation section, which provides OS-specific instructions. 
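    .. note:: With ``hash_function="md5"``, the text is first encoded to UTF-8 bytes, so \
        ``get_hash("test_string", hash_function="md5")`` returns the standard 32-character hexadecimal md5 \
        digest of those bytes (exact digest omitted here).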
241 | 
242 |     Usage::
243 | 
244 |         from pewtils import get_hash
245 | 
246 |         >>> text = 'test_string'
247 |         >>> get_hash(text)
248 |         '3:HI2:Hl'
249 |     """
250 | 
251 |     decoded_text = decode_text(text).encode("utf8").strip()
252 |     if not decoded_text:  # an ``== ""`` check would never match the bytes value in Python 3
253 |         decoded_text = text
254 |     text = decoded_text
255 |     if hash_function == "nilsimsa":
256 |         from nilsimsa import Nilsimsa
257 | 
258 |         hashed = Nilsimsa(text).hexdigest()
259 |     elif hash_function == "md5":
260 |         hashed = md5(text).hexdigest()
261 |     else:
262 |         try:
263 |             import ssdeep
264 |         except ImportError:
265 |             raise Exception(
266 |                 """
267 |                 To use get_hash with hash_function='ssdeep' you need to install the ssdeep package. Try running:
268 |                 >> BUILD_LIB=1 pip install ssdeep
269 |                 If you encounter installation problems, refer to the pewtils documentation for troubleshooting help.
270 |                 """
271 |             )
272 |         hashed = ssdeep.hash(text)
273 | 
274 |     return hashed
275 | 
276 | 
277 | def zipcode_num_to_string(zipcode):
278 | 
279 |     """
280 |     Attempts to standardize a string/integer/float that contains a U.S. zipcode. Front-pads with zeroes and uses the \
281 |     :py:mod:`zipcodes` library to ensure that the zipcode is real. If the zipcode doesn't validate successfully, \
282 |     ``None`` will be returned.
283 | 
284 |     :param zipcode: Object that contains a sequence of digits (string, integer, float)
285 |     :type zipcode: str or float or int
286 |     :return: A 5-digit string, or None
287 |     :rtype: str or NoneType
288 | 
289 |     Usage::
290 | 
291 |         from pewtils import zipcode_num_to_string
292 | 
293 |         >>> zipcode_number = 6463
294 |         >>> zipcode_num_to_string(zipcode_number)
295 |         '06463'
296 |         >>> not_zipcode_number = 345678
297 |         >>> zipcode_num_to_string(not_zipcode_number)
298 |         >>>
299 |     """
300 | 
301 |     if is_not_null(zipcode):
302 | 
303 |         try:
304 |             zipcode = str(int(str(zipcode).strip()[:5].split(".")[0]))
305 |         except (TypeError, ValueError):
306 |             zipcode = None
307 | 
308 |         if zipcode:
309 |             zipcode = zipcode.zfill(5)
310 |             if zipcodes.is_real(zipcode):
311 |                 return zipcode
312 |             else:
313 |                 return None
314 |     else:
315 | 
316 |         zipcode = None
317 | 
318 |     return zipcode
319 | 
320 | 
321 | def concat_text(*args):
322 | 
323 |     """
324 |     A helper function for concatenating text values. Text values are passed through :py:func:`pewtils.decode_text` \
325 |     before concatenation.
326 | 
327 |     :param args: A list of text values that will be returned as a single space-separated string
328 |     :type args: list
329 |     :return: A single string of the values concatenated by spaces
330 |     :rtype: str
331 | 
332 |     Usage::
333 | 
334 |         from pewtils import concat_text
335 | 
336 |         >>> text_list = ['Hello', 'World', '!']
337 |         >>> concat_text(*text_list)
338 |         'Hello World !'
339 |     """
340 | 
341 |     strs = [decode_text(arg) for arg in args if is_not_null(arg)]
342 |     return " ".join(strs) if is_not_null(strs, empty_lists_are_null=True) else ""
343 | 
344 | 
345 | def vector_concat_text(*args):
346 | 
347 |     """
348 |     Takes a list of equal-length lists and returns a single list with the rows concatenated by spaces. Useful for \
349 |     merging multiple columns of text in Pandas.
350 | 
351 |     :param args: A list of lists or :py:class:`pandas.Series` objects that contain text values
352 |     :return: A single list or :py:class:`pandas.Series` with all of the text values for each row concatenated
353 | 
354 |     Usage with lists::
355 | 
356 |         from pewtils import vector_concat_text
357 | 
358 |         >>> text_lists = ["one", "two", "three"], ["a", "b", "c"]
359 |         >>> vector_concat_text(*text_lists)
360 |         ['one a', 'two b', 'three c']
361 | 
362 |     Usage with Pandas::
363 | 
364 |         import pandas as pd
365 |         from pewtils import vector_concat_text
366 | 
367 |         df = pd.DataFrame([
368 |             {"text1": "one", "text2": "a"},
369 |             {"text1": "two", "text2": "b"},
370 |             {"text1": "three", "text2": "c"}
371 |         ])
372 | 
373 |         >>> df['text'] = vector_concat_text(df['text1'], df['text2'])
374 |         >>> df['text']
375 |         0      one a
376 |         1      two b
377 |         2    three c
378 |         Name: text, dtype: object
379 |     """
380 | 
381 |     return np.vectorize(concat_text)(*args)
382 | 
383 | 
384 | def scale_range(old_val, old_min, old_max, new_min, new_max):
385 | 
386 |     """
387 |     Scales a value from one range to another. Useful for comparing values from different scales, for example.
388 | 
389 |     :param old_val: The value to convert
390 |     :type old_val: int or float
391 |     :param old_min: The minimum of the old range
392 |     :type old_min: int or float
393 |     :param old_max: The maximum of the old range
394 |     :type old_max: int or float
395 |     :param new_min: The minimum of the new range
396 |     :type new_min: int or float
397 |     :param new_max: The maximum of the new range
398 |     :type new_max: int or float
399 |     :return: Value equivalent from the new scale
400 |     :rtype: float
401 | 
402 |     Usage::
403 | 
404 |         from pewtils import scale_range
405 | 
406 |         >>> old_value = 5
407 |         >>> scale_range(old_value, 0, 10, 0, 20)
408 |         10.0
409 |     """
410 | 
411 |     return (
412 |         ((float(old_val) - float(old_min)) * (float(new_max) - float(new_min)))
413 |         / (float(old_max) - float(old_min))
414 |     ) + float(new_min)
415 | 
416 | 
417 | def new_random_number(attempt=1, minimum=1.0, maximum=10):
418 | 
419 |     """
420 |     Returns a random number from a range whose upper bound grows exponentially with ``attempt``.
421 |     The upper bound is capped using the ``maximum`` parameter (default 10) but is otherwise determined by the
422 |     function ``minimum * 2 ** attempt``.
423 | 
424 |     | In effect, this means that when ``attempt`` is 1, the number returned will be in the range of the minimum \
425 |     and twice the minimum's value. As you increase ``attempt``, the possible range of returned values expands \
426 |     exponentially until it hits the ``maximum`` ceiling.
427 | 
428 |     :param attempt: Increasing attempt will expand the upper-bound of the range from which the random number is drawn
429 |     :type attempt: int
430 |     :param minimum: The minimum allowed value that can be returned; must be greater than zero.
431 |     :type minimum: int or float
432 |     :param maximum: The maximum allowed value that can be returned; must be greater than ``minimum``.
433 |     :type maximum: int or float
434 |     :return: A random number drawn uniformly from across the range determined by the provided arguments.
435 |     :rtype: float
436 | 
437 |     .. note:: One useful application of this function is rate limiting: a script can pause in between requests at a \
438 |     reasonably fast pace, but then moderate itself and pause for longer periods if it begins encountering errors, \
439 |     simply by increasing the ``attempt`` variable (hence its name).
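    .. note:: A quick worked example of the bound: with ``attempt=3``, ``minimum=1.0``, and ``maximum=10``, \
        the draw is uniform over ``[1.0, min(10, 1.0 * 2 ** 3)] = [1.0, 8.0]``; by ``attempt=4`` the \
        ``maximum`` cap of 10 takes over.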
440 | 
441 |     Usage::
442 | 
443 |         from pewtils import new_random_number
444 | 
445 |         >>> new_random_number(attempt=1)
446 |         1.9835581813820642
447 |         >>> new_random_number(attempt=2)
448 |         3.1022350739064
449 |     """
450 | 
451 |     return uniform(minimum, min(maximum, minimum * 2 ** attempt))
452 | 
453 | 
454 | def chunk_list(seq, size):
455 | 
456 |     """
457 |     Takes a sequence and groups values into smaller lists based on the specified size.
458 | 
459 |     :param seq: List or a list-like iterable
460 |     :type seq: list or iterable
461 |     :return: A generator that yields sub-lists of up to ``size`` elements
462 |     :param size: Desired size of each sublist
463 |     :type size: int
464 |     :rtype: generator
465 | 
466 |     Usage::
467 | 
468 |         from pewtils import chunk_list
469 | 
470 |         >>> number_sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
471 |         >>> list(chunk_list(number_sequence, 3))
472 |         [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10]]
473 |     """
474 | 
475 |     return (seq[pos : (pos + size)] for pos in range(0, len(seq), size))
476 | 
477 | 
478 | def flatten_list(l):
479 | 
480 |     """
481 |     Takes a list of lists and flattens it into a single list. Nice shortcut to avoid having to deal with list \
482 |     comprehension.
483 | 
484 |     :param l: A list of lists
485 |     :type l: list
486 |     :return: A flattened list of all of the elements contained in the original list of lists
487 |     :rtype: list
488 | 
489 |     Usage::
490 | 
491 |         from pewtils import flatten_list
492 | 
493 |         >>> nested_lists = [[1, 2, 3], [4, 5, 6]]
494 |         >>> flatten_list(nested_lists)
495 |         [1, 2, 3, 4, 5, 6]
496 |     """
497 | 
498 |     return [item for sublist in l for item in sublist]
499 | 
500 | 
501 | def scan_dictionary(search_dict, field):
502 | 
503 |     """
504 |     Takes a dictionary with nested lists and dictionaries, and searches recursively for a specific key. Since keys can
505 |     occur more than once, the function returns a list of all of the found values along with a list of equal length
506 |     that specifies the nested key path to each value.
507 | 
508 |     :param search_dict: The dictionary to search
509 |     :type search_dict: dict
510 |     :param field: The field to find
511 |     :type field: str
512 |     :return: A tuple of the found values and file path-style strings representing their locations
513 |     :rtype: tuple
514 | 
515 |     Usage::
516 | 
517 |         from pewtils import scan_dictionary
518 | 
519 |         >>> test_dict = {"one": {"two": {"three": "four"}}}
520 |         >>> scan_dictionary(test_dict, "three")
521 |         (['four'], ['one/two/three/'])
522 |         >>> scan_dictionary(test_dict, "five")
523 |         ([], [])
524 |     """
525 | 
526 |     fields_found = []
527 |     key_path = []
528 | 
529 |     for key, value in search_dict.items():
530 |         if key == field:
531 |             fields_found.append(value)
532 |             new_str = str(key) + "/"
533 |             key_path.append(new_str)
534 | 
535 |         elif isinstance(value, dict):
536 |             results, path = scan_dictionary(value, field)
537 |             for result in results:
538 |                 fields_found.append(result)
539 |             for road in path:
540 |                 new_str = str(key) + "/" + road
541 |                 key_path.append(new_str)
542 | 
543 |         elif isinstance(value, list):
544 |             for item in value:
545 |                 if isinstance(item, dict):
546 |                     more_results, more_path = scan_dictionary(item, field)
547 |                     for another_result in more_results:
548 |                         fields_found.append(another_result)
549 |                     for another_road in more_path:
550 |                         new_str = str(key) + "/" + another_road
551 |                         key_path.append(new_str)
552 | 
553 |     return fields_found, key_path
554 | 
555 | 
556 | def recursive_update(existing, new):
557 | 
558 |     """
559 |     Takes an object and a dictionary representation of attributes and values, and recursively traverses through the
560 |     new values and updates the object.
561 | 
562 |     | The keys in ``new`` can correspond to either attribute names or dictionary keys on ``existing``; \
563 |     you can use this to iterate through a nested hierarchy of objects and dictionaries and update whatever you like.
564 | 565 | :param existing: An object or dictionary 566 | :type existing: dict or object 567 | :param new: A dictionary where keys correspond to the names of keys in the existing dictionary or attributes on \ 568 | the existing object 569 | :type new: dict or object 570 | :return: A copy of the original object or dictionary, with the values updated based on the provided map 571 | :rtype: dict or object 572 | 573 | Usage:: 574 | 575 | from pewtils import recursive_update 576 | 577 | class TestObject(object): 578 | def __init__(self, value): 579 | self.value = value 580 | self.dict = {"obj_key": "original"} 581 | def __repr__(self): 582 | return("TestObject(value='{}', dict={})".format(self.value, self.dict)) 583 | 584 | original = { 585 | "object": TestObject("original"), 586 | "key1": {"key2": "original"} 587 | } 588 | update = { 589 | "object": {"value": "updated", "dict": {"obj_key": "updated"}}, 590 | "key1": {"key3": "new"} 591 | } 592 | 593 | >>> recursive_update(original, update) 594 | {'object': TestObject(value='updated', dict={'obj_key': 'updated'}), 595 | 'key1': {'key2': 'original', 'key3': 'new'}} 596 | 597 | """ 598 | 599 | def _hasattr(obj, attr): 600 | if isinstance(obj, dict): 601 | return attr in obj 602 | else: 603 | return hasattr(obj, attr) 604 | 605 | def _setattr(obj, attr, val): 606 | if isinstance(obj, dict): 607 | obj[attr] = val 608 | else: 609 | setattr(obj, attr, val) 610 | return obj 611 | 612 | def _getattr(obj, attr): 613 | if isinstance(obj, dict): 614 | return obj[attr] 615 | else: 616 | return getattr(obj, attr) 617 | 618 | existing = copy.deepcopy(existing) 619 | if isinstance(new, dict): 620 | for k, v in new.items(): 621 | 622 | if _hasattr(existing, k): 623 | _setattr( 624 | existing, 625 | k, 626 | recursive_update(_getattr(existing, k), _getattr(new, k)), 627 | ) 628 | else: 629 | _setattr(existing, k, _getattr(new, k)) 630 | return existing 631 | else: 632 | return new 633 | 634 | 635 | def cached_series_mapper(series, function): 636 | 637 | """ 638 | Applies a function to all of the unique values in a :py:class:`pandas.Series` to avoid repeating the operation \ 639 | on duplicate values. 640 | 641 | | Great if you're doing database lookups or something computationally intensive on a column that may contain \ 642 | repeating values, etc. 643 | 644 | :param series: A :py:class:`pandas.Series` 645 | :type series: :py:class:`pandas.Series` 646 | :param function: A function to apply to values in the :py:class:`pandas.Series` 647 | :return: The resulting :py:class:`pandas.Series` 648 | :rtype: :py:class:`pandas.Series` 649 | 650 | Usage:: 651 | 652 | import pandas as pd 653 | from pewtils import cached_series_mapper 654 | 655 | values = ["value"]*10 656 | def my_function(x): 657 | print(x) 658 | return x 659 | 660 | df = pd.DataFrame(values, columns=['column']) 661 | >>> mapped = df['column'].map(my_function) 662 | value 663 | value 664 | value 665 | value 666 | value 667 | value 668 | value 669 | value 670 | value 671 | value 672 | >>> mapped = cached_series_mapper(df['column'], my_function) 673 | value 674 | """ 675 | 676 | val_map = {} 677 | for val in series.unique(): 678 | val_map[val] = function(val) 679 | 680 | return series.map(val_map) 681 | 682 | 683 | def multiprocess_group_apply(grp, func, *args, **kwargs): 684 | """ 685 | 686 | Apply arbitrary functions to groups or slices of a Pandas DataFrame using multiprocessing, to efficiently \ 687 | map or aggregate data. 
Each group gets processed in parallel, and the results are concatenated together after \
688 |     all processing has finished. If you pass a function that aggregates each group into a single value, you'll get \
689 |     back a DataFrame with one row for each group, as though you had performed a `.agg` function. \
690 |     If you pass a function that returns a value for each _row_ in the group, then you'll get back a DataFrame \
691 |     in your original shape. In this case, you would simply be using grouping to efficiently apply a row-level operation.
692 | 
693 |     :param grp: A Pandas DataFrameGroupBy object
694 |     :type grp: pandas.core.groupby.generic.DataFrameGroupBy
695 |     :param func: A function that accepts a Pandas DataFrame representing a group from the original DataFrame
696 |     :type func: function
697 |     :param args: Arguments to be passed to the function
698 |     :param kwargs: Keyword arguments to be passed to the function
699 |     :return: The resulting DataFrame
700 |     :rtype: pandas.DataFrame
701 | 
702 |     Usage::
703 | 
704 |         df = pd.DataFrame([
705 |             {"group_col": 1, "value": "one two three"},
706 |             {"group_col": 1, "value": "one two three four"},
707 |             {"group_col": 2, "value": "one two"}
708 |         ])
709 | 
710 |         ### For efficient aggregation
711 | 
712 |         def get_length(grp):
713 |             # Simple function that returns the number of rows in each group
714 |             return len(grp)
715 | 
716 |         >>> df.groupby("group_col").apply(lambda x: len(x))
717 |         1    2
718 |         2    1
719 |         dtype: int64
720 |         >>> multiprocess_group_apply(df.groupby("group_col"), get_length)
721 |         1    2
722 |         2    1
723 |         dtype: int64
724 | 
725 |         ### For efficient mapping
726 | 
727 |         def get_value_length(grp):
728 |             # Simple function that returns the word count of each row in the group
729 |             return grp['value'].map(lambda x: len(x.split()))
730 | 
731 |         >>> df['value'].map(lambda x: len(x.split()))
732 |         0    3
733 |         1    4
734 |         2    2
735 |         Name: value, dtype: int64
736 |         >>> multiprocess_group_apply(df.groupby("group_col"), get_value_length)
737 |         0    3
738 |         1    4
739 |         2    2
740 |         Name: value, dtype: int64
741 | 
742 |         # If you just want to efficiently map a function to your DataFrame and you want to evenly split your
743 |         # DataFrame into groups, you could do the following:
744 | 
745 |         df["group_col"] = (df.reset_index().index.values / (len(df) / multiprocessing.cpu_count())).astype(int)
746 |         df["mapped_value"] = multiprocess_group_apply(df.groupby("group_col"), get_value_length)
747 |         del df["group_col"]
748 | 
749 |     """
750 | 
751 |     results = []
752 |     pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
753 |     for name, group in grp:
754 |         results.append(pool.apply_async(func, (group,) + args, kwargs))
755 |     pool.close()
756 |     pool.join()
757 |     results = [r.get() for r in results]
758 | 
759 |     if not hasattr(results[0], "__len__") or isinstance(results[0], str):
760 |         # Assume it's an aggregation function
761 |         return pd.Series(results, index=[g for g, _ in grp])
762 |     else:
763 |         # Assume you're just mapping the function normally and using the groups to split the data
764 |         return pd.concat(results)
765 | 
766 | 
767 | def extract_json_from_folder(
768 |     folder_path, include_subdirs=False, concat_subdir_names=False
769 | ):
770 | 
771 |     """
772 |     Takes a folder path and traverses it, looking for JSON files. When it finds one, it adds it to a dictionary,
773 |     with the key being the name of the file and the value being the JSON itself. This is useful if you store \
774 |     configurations or various metadata in a nested folder structure, which we do for things like content analysis \
775 |     codebooks.
776 | 
777 |     | Has options for recursively traversing a folder, and for optionally concatenating the subfolder names \
778 |     into the dictionary keys as prefixes.
779 | 
780 |     :param folder_path: The path of the folder to scan
781 |     :type folder_path: str
782 |     :param include_subdirs: Whether or not to recursively scan subfolders
783 |     :type include_subdirs: bool
784 |     :param concat_subdir_names: Whether or not to prefix the dictionary keys with the names of subfolders
785 |     :type concat_subdir_names: bool
786 |     :return: A dictionary containing all of the extracted JSON files as values
787 |     :rtype: dict
788 | 
789 |     Usage::
790 | 
791 |         # For example, let's say we have the following folder structure
792 |         # with various JSON codebooks scattered about:
793 |         #
794 |         # /codebooks
795 |         #     /logos
796 |         #         /antipathy.json
797 |         #     /atp_open_ends
798 |         #         /w29
799 |         #             /sources_of_meaning.json
800 |         #
801 |         # Here's what we'd get depending on the different parameters we use:
802 | 
803 |         from pewtils import extract_json_from_folder
804 |         >>> extract_json_from_folder("codebooks", include_subdirs=False, concat_subdir_names=False)
805 |         {}
806 |         >>> extract_json_from_folder("codebooks", include_subdirs=True, concat_subdir_names=False)
807 |         {
808 |             "logos": {"antipathy": "json would be here"},
809 |             "atp_open_ends": {"w29": {"sources_of_meaning": "json would be here"}}
810 |         }
811 |         >>> extract_json_from_folder("codebooks", include_subdirs=True, concat_subdir_names=True)
812 |         {
813 |             "logos_antipathy": "json would be here",
814 |             "atp_open_ends_w29_sources_of_meaning": "json would be here"
815 |         }
816 |     """
817 | 
818 |     attributes = {}
819 |     subdirs = []
820 |     if os.path.exists(folder_path):
821 |         for path, subdir, files in walk(folder_path):
822 |             if folder_path == path:
823 |                 for file in files:
824 |                     if file.endswith(".json"):
825 |                         key = re.sub(r"\.json$", "", file)  # anchored so only the extension is stripped
826 |                         with closing(open(os.path.join(path, file), "r")) as infile:
827 |                             try:
828 |                                 attributes[key] = json.load(infile)
829 |                             except ValueError:
830 |                                 print("JSON file is invalid: {}".format(file))
831 |                 if subdir:
832 |                     subdirs.append(subdir)
833 | 
834 |     if include_subdirs and len(subdirs) > 0:
835 |         for subdir in subdirs[0]:
836 |             if subdir != "__pycache__":
837 |                 results = extract_json_from_folder(
838 |                     os.path.join(folder_path, subdir),
839 |                     include_subdirs=True,
840 |                     concat_subdir_names=concat_subdir_names,
841 |                 )
842 |                 if not concat_subdir_names:
843 |                     attributes[subdir] = results
844 |                 else:
845 |                     for subattr_name, subattr in results.items():
846 |                         attributes["_".join([subdir, subattr_name])] = subattr
847 | 
848 |     return attributes
849 | 
850 | 
851 | def extract_attributes_from_folder_modules(
852 |     folder_path,
853 |     attribute_name,
854 |     include_subdirs=False,
855 |     concat_subdir_names=False,
856 |     current_subdirs=None,
857 | ):
858 | 
859 |     """
860 |     Takes a folder path and traverses it, looking for Python files that contain an attribute (i.e., class, function,
861 |     etc.) with a given name. It extracts those attributes and returns a dictionary where the keys are the names of the
862 |     files that contained the attributes, and the values are the attributes themselves.
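    For example (hypothetical file and attribute names, for illustration), a folder ``plugins/`` containing \
    ``foo.py`` that defines a function named ``handler`` and ``bar.py`` that does not would yield \
    ``{"foo": <the handler function>}`` when called as ``extract_attributes_from_folder_modules("plugins", "handler")``.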
863 | 864 | This operates exactly the same as :py:func:`pewtils.extract_json_from_folder` except instead of reading JSON files 865 | and adding them as values in the dictionary that gets returned, this function will instead look for Python files 866 | that contain a function, class, method, or attribute with the name you provide in ``attribute_name`` and will load 867 | that attribute in as the values. 868 | 869 | :param folder_path: The path of a folder/module to scan 870 | :type folder_path: str 871 | :param attribute_name: The name of the attribute (class, function, variable, etc.) to extract from files 872 | :type attribute_name: str 873 | :param include_subdirs: Whether or not to recursively scan subfolders 874 | :type include_subdirs: bool 875 | :param concat_subdir_names: Whether or not to prefix the dictionary keys with the names of subfolders 876 | :type concat_subdir_names: bool 877 | :param current_subdirs: Used to track location when recursively iterating a module (do not use) 878 | :return: A dictionary with all of the extracted attributes as values 879 | :rtype: dict 880 | 881 | .. note:: if you use Python 2.7 you will need to add ``from __future__ import absolute_import`` to the top of files \ 882 | that you want to scan and import using this function. 883 | """ 884 | 885 | if not folder_path.startswith(os.getcwd()): 886 | folder_path = os.path.join(os.getcwd(), folder_path) 887 | test_path, _ = os.path.split(folder_path) 888 | while test_path != "/": 889 | if "__init__.py" not in os.listdir(test_path): 890 | break 891 | test_path, _ = os.path.split(test_path) 892 | module_location = test_path 893 | 894 | current_folder = folder_path.split("/")[-1] 895 | if not current_subdirs: 896 | current_subdirs = [] 897 | 898 | attributes = {} 899 | subdirs = [] 900 | if os.path.exists(folder_path): 901 | for path, subdir_list, files in walk(folder_path): 902 | if folder_path == path: 903 | for file in files: 904 | if file.endswith(".py") and not file.startswith("__init__"): 905 | file_name = file.split(".")[0] 906 | module_name = re.sub( 907 | "/", 908 | ".", 909 | re.sub( 910 | module_location, 911 | "", 912 | os.path.splitext(os.path.join(path, file))[0], 913 | ), 914 | ).strip(".") 915 | if module_name in sys.modules: 916 | module = sys.modules[module_name] 917 | # https://github.com/ansible/ansible/issues/13110 918 | else: 919 | try: 920 | module = SourceFileLoader( 921 | module_name, os.path.join(path, file) 922 | ).load_module() 923 | except NameError: 924 | file, pathname, description = imp.find_module( 925 | file_name, [path] 926 | ) 927 | warnings.simplefilter("error", RuntimeWarning) 928 | try: 929 | module = imp.load_module( 930 | module_name, file, pathname, description 931 | ) 932 | except RuntimeWarning: 933 | try: 934 | module = imp.load_module( 935 | module_name.split(".")[-1], 936 | file, 937 | pathname, 938 | description, 939 | ) 940 | except RuntimeWarning: 941 | module = None 942 | except (ImportError, AttributeError): 943 | module = None 944 | except (ImportError, AttributeError): 945 | module = None 946 | if hasattr(module, attribute_name): 947 | attributes[file_name] = getattr(module, attribute_name) 948 | 949 | if subdir_list: 950 | subdirs.extend(subdir_list) 951 | 952 | if include_subdirs: 953 | for subdir in set(subdirs): 954 | results = extract_attributes_from_folder_modules( 955 | os.path.join(folder_path, subdir), 956 | attribute_name, 957 | concat_subdir_names=concat_subdir_names, 958 | include_subdirs=True, 959 | current_subdirs=current_subdirs + 
[current_folder], 960 | ) 961 | if not concat_subdir_names: 962 | attributes[subdir] = results 963 | else: 964 | for subattr_name, subattr in results.items(): 965 | attributes["_".join([subdir, subattr_name])] = subattr 966 | 967 | if is_null(current_subdirs, empty_lists_are_null=True): 968 | for name in attributes.keys(): 969 | try: 970 | attributes[name]._name = name 971 | except AttributeError: 972 | pass 973 | 974 | return attributes 975 | 976 | 977 | class timeout_wrapper: 978 | def __init__(self, seconds=1, error_message="Timeout"): 979 | """ 980 | Context manager that will raise an error if it takes longer than the specified number of seconds to execute. 981 | Found via this very helpful Stack Overflow post: 982 | https://stackoverflow.com/questions/2281850/timeout-function-if-it-takes-too-long-to-finish 983 | 984 | :param seconds: Number of seconds allowed for the code to execute 985 | :param error_message: Optional custom error message to raise 986 | """ 987 | self.seconds = seconds 988 | self.error_message = error_message 989 | 990 | def handle_timeout(self, signum, frame): 991 | raise Exception(self.error_message) 992 | 993 | def __enter__(self): 994 | signal.signal(signal.SIGALRM, self.handle_timeout) 995 | signal.alarm(self.seconds) 996 | 997 | def __exit__(self, t, value, traceback): 998 | signal.alarm(0) 999 | 1000 | 1001 | class PrintExecutionTime(object): 1002 | 1003 | """ 1004 | Simple context manager to print the time it takes for a block of code to execute 1005 | 1006 | :param label: A label to print alongside the execution time 1007 | :param stdout: a StringIO-like output stream (sys.stdout by default) 1008 | 1009 | Usage:: 1010 | 1011 | from pewtils import PrintExecutionTime 1012 | 1013 | >>> with PrintExecutionTime(label="my function"): time.sleep(5) 1014 | my function: 5.004292011260986 seconds 1015 | 1016 | """ 1017 | 1018 | def __init__(self, label=None, stdout=None): 1019 | self.start_time = None 1020 | self.end_time = None 1021 | self.label = label 1022 | self.stdout = sys.stdout if not stdout else stdout 1023 | 1024 | def __enter__(self): 1025 | self.start_time = time.time() 1026 | return self 1027 | 1028 | def __exit__(self, exc_type, exc_value, exc_traceback): 1029 | self.end_time = time.time() 1030 | if self.label: 1031 | self.stdout.write( 1032 | "{}: {} seconds".format(self.label, self.end_time - self.start_time) 1033 | ) 1034 | else: 1035 | self.stdout.write("{} seconds".format(self.end_time - self.start_time)) 1036 | --------------------------------------------------------------------------------