├── pewtils ├── VERSION ├── general_link_shorteners.csv ├── regex.py ├── vanity_link_shorteners.csv ├── io.py ├── http.py └── __init__.py ├── tests ├── files │ ├── __init__.py │ ├── subfolder │ │ ├── __init__.py │ │ ├── subfolder_json.json │ │ └── subfolder_py.py │ ├── json.json │ ├── py.py │ ├── example_stripped.html │ ├── example_stripped_simple.html │ └── example.html ├── __init__.py ├── regex.py ├── http.py ├── io.py └── base.py ├── docs_source ├── _static │ ├── .gitkeep │ └── theme_overrides.css ├── regex.rst ├── io.rst ├── http.rst ├── http_link_shorteners.rst ├── pewtils_core.rst ├── index.rst ├── conf.py └── examples.rst ├── MANIFEST.in ├── .gitignore ├── .bulldozer.yml ├── CHANGELOG.md ├── requirements.txt ├── .coveragerc ├── .bumpversion.cfg ├── .policy.yml ├── .github ├── workflows │ ├── build-docs.yaml │ ├── unit-tests.yaml │ ├── build-release.yaml │ └── build-main.yaml └── runner.yaml ├── setup.py ├── Makefile ├── CONTRIBUTING.md ├── README.md └── LICENSE /pewtils/VERSION: -------------------------------------------------------------------------------- 1 | 0.1.1 -------------------------------------------------------------------------------- /tests/files/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs_source/_static/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/files/subfolder/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/files/json.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_val": 1 3 | } -------------------------------------------------------------------------------- /tests/files/py.py: -------------------------------------------------------------------------------- 1 | def test(): 2 | return "test1" 3 | -------------------------------------------------------------------------------- /tests/files/subfolder/subfolder_json.json: -------------------------------------------------------------------------------- 1 | { 2 | "test_val": 2 3 | } -------------------------------------------------------------------------------- /tests/files/subfolder/subfolder_py.py: -------------------------------------------------------------------------------- 1 | def test(): 2 | return "test2" 3 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include requirements.txt 3 | include pewtils/*.csv 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .env/ 2 | .ipynb_checkpoints 3 | *.pyc 4 | .python-version 5 | .idea/ 6 | _build/ 7 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import * 2 | from .http import * 3 | from .io import * 4 | from .regex import * 5 | -------------------------------------------------------------------------------- /tests/files/example_stripped.html: 
--------------------------------------------------------------------------------
 1 | Example Domain
 2 | 
 3 | This domain is established to be used for illustrative examples in documents. You may use this domain in examples without prior coordination or asking for permission.
 4 | 
 5 | More information...
--------------------------------------------------------------------------------
/.bulldozer.yml:
--------------------------------------------------------------------------------
 1 | version: 1
 2 | 
 3 | merge:
 4 |   method: merge
 5 |   whitelist:
 6 |     labels: ["automerge"]
 7 |     comment_substrings: ["==AUTOMERGE=="]
 8 |   blacklist:
 9 |     labels: ["block"]
10 |     comment_substrings: ["==BLOCK=="]
11 | 
12 | update:
13 |   whitelist:
14 |     labels: ["wip", "update me"]
15 | 
--------------------------------------------------------------------------------
/docs_source/regex.rst:
--------------------------------------------------------------------------------
 1 | **************
 2 | Regex Patterns
 3 | **************
 4 | 
 5 | This module contains a modest but growing collection of regular expressions, useful for \
 6 | extracting things like URLs, monetary values, and more.
 7 | 
 8 | .. automodule :: pewtils.regex
 9 |    :autosummary:
10 |    :members:
11 | 
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # Release Notes
 2 | 
 3 | ## 1.1.2
 4 | 
 5 | - Fix some FileHandler interactions with S3
 6 | - Code linting improvements
 7 | 
 8 | ## 1.1.1
 9 | 
10 | - Release repo history
11 | 
12 | ## 1.1.0
13 | 
14 | - Replace FileHandler's dependency on boto2 with boto3, which is required for role-based authentication with AWS services
15 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | Unidecode>=1.1.1
 2 | beautifulsoup4>=4.10.0
 3 | boto3>=1.16.0
 4 | chardet>=4.0.0
 5 | fake_useragent>=0.1.11
 6 | lxml>=4.4.2
 7 | nilsimsa>=0.3.8
 8 | numpy>=1.18.1
 9 | pandas>=0.25.3
10 | requests>=2.25.1
11 | scandir>=1.10.0
12 | six>=1.16.0
13 | ssdeep>=3.4
14 | stopit>=1.1.2
15 | tldextract>=2.2.2
16 | zipcodes>=1.1.0
17 | 
--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
 1 | # .coveragerc to control coverage.py
 2 | [run]
 3 | branch = False
 4 | cover_pylib = False
 5 | source = pewtils
 6 | omit =
 7 |     */site-packages/*
 8 |     pewtils/internal/*
 9 | 
10 | [report]
11 | ignore_errors = True
12 | exclude_lines =
13 |     pragma: no cover
14 |     def __repr__
15 |     except
16 | omit =
17 |     */site-packages/*
18 |     pewtils/internal/*
--------------------------------------------------------------------------------
/docs_source/_static/theme_overrides.css:
--------------------------------------------------------------------------------
 1 | /* override table width restrictions */
 2 | @media screen and (min-width: 767px) {
 3 | 
 4 |     .wy-table-responsive table td {
 5 |         /* !important prevents the common CSS stylesheets from overriding
 6 |            this as on RTD they are loaded after this stylesheet */
 7 |         white-space: normal !important;
 8 |     }
 9 | 
10 |     .wy-table-responsive {
11 |         overflow: visible !important;
12 |     }
13 | }
--------------------------------------------------------------------------------
/.bumpversion.cfg:
--------------------------------------------------------------------------------
 1 | 
[bumpversion]
 2 | current_version = 1.1.6.dev1
 3 | commit = False
 4 | tag = False
 5 | parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>\w+)(?P<build>\d+))?
 6 | serialize =
 7 | 	{major}.{minor}.{patch}.{release}{build}
 8 | 	{major}.{minor}.{patch}
 9 | 
10 | [bumpversion:part:release]
11 | first_value = dev
12 | optional_value = prod
13 | values =
14 | 	dev
15 | 	prod
16 | 
17 | [bumpversion:part:build]
18 | 
19 | [bumpversion:file:setup.py]
20 | 
21 | [bumpversion:file:docs_source/conf.py]
22 | 
--------------------------------------------------------------------------------
/docs_source/io.rst:
--------------------------------------------------------------------------------
 1 | **************
 2 | I/O Tools
 3 | **************
 4 | 
 5 | This module contains utilities related to reading and writing files in a variety of formats. \
 6 | Right now, it consists exclusively of the :py:class:`pewtils.io.FileHandler` class, which provides \
 7 | a standardized interface for loading and saving data both locally and on Amazon S3. It doesn't \
 8 | always work exactly as intended, but 99% of the time, it gives us a way to read and write files \
 9 | with just one or two lines of code - and accordingly, we use it everywhere. We hope you do too!
10 | 
11 | .. automodule :: pewtils.io
12 |    :autosummary:
13 |    :members:
14 | 
--------------------------------------------------------------------------------
/pewtils/general_link_shorteners.csv:
--------------------------------------------------------------------------------
 1 | shortener
 2 | abre.ai
 3 | adf.ly
 4 | bit.do
 5 | bit.ly
 6 | bitly.com
 7 | buff.ly
 8 | crwd.fr
 9 | cutt.ly
10 | disq.us
11 | dlvr.it
12 | every.tw
13 | flip.it
14 | fus.in
15 | fw.to
16 | fwdaga.in
17 | goo.gl
18 | ht.ly
19 | hub.am
20 | hubs.ly
21 | is.gd
22 | j.mp
23 | lnks.gd
24 | loom.ly
25 | lsh.re
26 | more.pr
27 | msgp.pl
28 | mvnt.us
29 | ora.cl
30 | ow.ly
31 | owl.li
32 | pgj.cc
33 | po.st
34 | qoo.ly
35 | rb.gy
36 | scr.bi
37 | shar.es
38 | shoo.ly
39 | shout.lt
40 | shr.gs
41 | snip.ly
42 | snp.tv
43 | spr.ly
44 | su.pr
45 | t.co
46 | tiny.cc
47 | tinyurl.com
48 | trib.al
49 | trib.it
50 | urlm.in
51 | v.ht
52 | wp.me
--------------------------------------------------------------------------------
/tests/files/example_stripped_simple.html:
--------------------------------------------------------------------------------
 1 | Example Domain body { background-color: #f0f0f2; margin: 0; padding: 0; font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif; } div { width: 600px; margin: 5em auto; padding: 50px; background-color: #fff; border-radius: 1em; } a:link, a:visited { color: #38488f; text-decoration: none; } @media (max-width: 700px) { body { background-color: #fff; } div { width: auto; margin: 0 auto; border-radius: 0; padding: 1em; } }
 2 | Example Domain
 3 | This domain is established to be used for illustrative examples in documents. You may use this domain in examples without prior coordination or asking for permission.
 4 | 
 5 | More information...
 6 | 
 7 | 
--------------------------------------------------------------------------------
/.policy.yml:
--------------------------------------------------------------------------------
 1 | policy:
 2 |   approval:
 3 |     - or:
 4 |       - deploy updates
 5 |       - submodule updates
 6 |       - anointed maintainers say yes
 7 | 
 8 | approval_rules:
 9 |   - name: anointed maintainers say yes
10 |     options:
11 |       allow_contributor: true
12 |       invalidate_on_push: true
13 |     requires:
14 |       count: 1
15 |       teams:
16 |         - "pewresearch/pewtils-maintainers"
17 |       write_collaborators: true
18 | 
19 |   - name: deploy updates
20 |     options:
21 |       invalidate_on_push: true
22 |     if:
23 |       only_changed_files:
24 |         paths:
25 |           - '^deploy/.*'
26 | 
27 |   - name: submodule updates
28 |     options:
29 |       invalidate_on_push: true
30 |     if:
31 |       only_changed_files:
32 |         paths:
33 |           - '^src/.*'
34 | 
--------------------------------------------------------------------------------
/.github/workflows/build-docs.yaml:
--------------------------------------------------------------------------------
 1 | on:
 2 |   push:
 3 |     branches:
 4 |       - main
 5 |     paths:
 6 |       - .github/workflows/build-docs.yaml
 7 |       - Makefile
 8 |       - docs_source/**
 9 | 
10 | name: build-docs
11 | 
12 | jobs:
13 |   build-docs:
14 |     runs-on: pewtils-runner
15 |     name: build docs
16 |     permissions:
17 |       contents: write
18 |     env:
19 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
20 | 
21 |     steps:
22 |       - uses: actions/checkout@v2
23 | 
24 |       - name: Install python dependencies
25 |         run: |
26 |           while read requirement; do mamba install "conda-forge::$requirement" || true; done < requirements.txt
27 |           pip install -r requirements.txt
28 | 
29 |       - name: Build docs html
30 |         run: |
31 |           if [[ "${{ github.repository }}" == "pewresearch/pewtils" ]]; then
32 |             make github_docs
33 |           else
34 |             make s3_docs
35 |           fi
36 | 
--------------------------------------------------------------------------------
/docs_source/http.rst:
--------------------------------------------------------------------------------
 1 | **************
 2 | HTTP Utilities
 3 | **************
 4 | 
 5 | In this module, you'll find a variety of useful functions for working with web data. \
 6 | The :py:func:`pewtils.http.canonical_link` function is our best attempt at standardizing and cleaning a URL without \
 7 | losing any information, and the :py:func:`pewtils.http.strip_html` function is useful for attempting to extract text \
 8 | from raw HTML data with minimal fine-tuning.
 9 | 
10 | .. automodule :: pewtils.http
11 |    :autosummary:
12 |    :members:
13 | 
14 | +++++++++++++++
15 | Link Shorteners
16 | +++++++++++++++
17 | 
18 | List of link shorteners recognized by methods in this section.
19 | 
20 | General Link Shorteners
21 | ^^^^^^^^^^^^^^^^^^^^^^^
22 | 
23 | A list of known :ref:`gen_link_shorteners`.
24 | 
25 | Vanity Link Shorteners
26 | ^^^^^^^^^^^^^^^^^^^^^^^
27 | 
28 | A list of known :ref:`vanity_link_shorteners` used by specific websites (primarily news outlets).
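
For a quick sense of how the functions that consult these lists behave, here is a minimal sketch of domain extraction (the values shown mirror the behavior described in the function's documentation):

.. code-block:: python

    from pewtils.http import extract_domain_from_url

    >>> extract_domain_from_url("http://forums.bbc.co.uk", include_subdomain=False)
    'bbc.co.uk'
    >>> extract_domain_from_url("http://forums.bbc.co.uk", include_subdomain=True)
    'forums.bbc.co.uk'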
29 | 
--------------------------------------------------------------------------------
/.github/workflows/unit-tests.yaml:
--------------------------------------------------------------------------------
 1 | name: unit-tests
 2 | 
 3 | on:
 4 |   pull_request:
 5 |     branches:
 6 |       - main
 7 |     paths:
 8 |       - .github/workflows/unit-tests.yaml
 9 |       - Makefile
10 |       - pewtils/**
11 |       - requirements.txt
12 |       - setup.py
13 |       - tests/**
14 | 
15 | jobs:
16 |   unit-tests:
17 |     name: unit-tests
18 |     runs-on: pewtils-runner
19 |     env:
20 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
21 |     permissions:
22 |       contents: read
23 |       pull-requests: write
24 | 
25 |     steps:
26 |       - uses: actions/checkout@v2
27 | 
28 |       - name: Install python dependencies
29 |         run: |
30 |           while read requirement; do mamba install "conda-forge::$requirement" || true; done < requirements.txt
31 |           pip install -r requirements.txt
32 | 
33 |       - name: Lint with flake8
34 |         env:
35 |           REVIEWDOG_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
36 |         run: |
37 |           make python_lint_errors
38 |           make github_lint_flake8
39 | 
40 |       - name: Run unit tests
41 |         run: |
42 |           make python_test
43 | 
--------------------------------------------------------------------------------
/docs_source/http_link_shorteners.rst:
--------------------------------------------------------------------------------
 1 | .. _link_shorteners:
 2 | 
 3 | ***************
 4 | Link Shorteners
 5 | ***************
 6 | 
 7 | Lists of known link shorteners recognized by ``pewtils.http`` utility methods such \
 8 | as :py:func:`pewtils.http.canonical_link` and :py:func:`pewtils.http.extract_domain_from_url`. These lists were \
 9 | compiled from several collections of shortened links found in social media posts and news articles, so most of the \
10 | shorteners belong to news outlets and large popular websites, especially those with political content. Since domains \
11 | can be retired or may change ownership and get redirected to different websites over time, these lists may not \
12 | be perfectly accurate. We will try to keep them updated as we become aware of changes, but if you notice any \
13 | inaccuracies or wish to add to these lists, please consider making a pull request!
14 | 
15 | .. _gen_link_shorteners:
16 | .. csv-table:: Generic Link Shorteners
17 |    :file: ../pewtils/general_link_shorteners.csv
18 |    :widths: 30
19 |    :header-rows: 1
20 | 
21 | .. _vanity_link_shorteners:
22 | .. csv-table:: Vanity Link Shorteners
23 |    :file: ../pewtils/vanity_link_shorteners.csv
24 |    :widths: 30, 30, 30
25 |    :header-rows: 1
26 | 
--------------------------------------------------------------------------------
/docs_source/pewtils_core.rst:
--------------------------------------------------------------------------------
 1 | **************
 2 | Core Functions
 3 | **************
 4 | 
 5 | The main Pewtils module contains a variety of generally useful functions that make our researchers' \
 6 | lives easier. For those still working in Python 2.x, the :py:func:`pewtils.decode_text` function can help alleviate \
 7 | headaches related to text encodings. The :py:func:`pewtils.is_null` and :py:func:`pewtils.is_not_null` functions \
 8 | provide an easy way to deal with the wide variety of possible null values that exist in Python (and the broader \
 9 | research universe) by using a best-guess approach. When working with dictionaries or JSON records that need \
10 | to be updated, :py:func:`pewtils.recursive_update` makes it easy to map one version of an object onto another.
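
A minimal sketch of the null-checking and dictionary-merging cases (the expected values mirror the behavior shown on the Examples page):

.. code-block:: python

    from pewtils import is_not_null, recursive_update

    >>> is_not_null("N/A")  # common string stand-ins for null are caught
    False
    >>> recursive_update({"a": {"b": 1}}, {"a": {"c": 2}})
    {'a': {'b': 1, 'c': 2}}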
While \
11 | we strive to write efficient code that can cover every possible use-case, there are certainly some \
12 | edge cases that we haven't encountered, and other existing Python libraries may very well provide \
13 | many of these same features. This collection simply consists of functions we find ourselves using \
14 | again and again, and we hope that Pewtils may help expand your daily toolkit in some way as well.
15 | 
16 | .. automodule :: pewtils.__init__
17 |    :autosummary:
18 |    :members:
19 | 
--------------------------------------------------------------------------------
/pewtils/regex.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | 
 4 | URL_REGEX = re.compile(
 5 |     r"((?:https?:\/\/(?:www\.)?)?[-a-zA-Z0-9@:%._\+~#=]{1,4096}\.[a-z]{2,6}\b(?:[-a-zA-Z0-9@:%_\+.~#?&//=]*))"
 6 | )
 7 | """
 8 | A compiled regular expression for extracting (probably) valid URLs.
 9 | """
10 | 
11 | DOMAIN_REGEX = re.compile(
12 |     r"(?:http[s]?\:\/\/)?(?:www(?:s?)\.)?([\w\.\-]+)(?:[\\\/](?:.+))?"
13 | )
14 | """
15 | A compiled regular expression for extracting domains from URLs. Can be useful in a pinch but we recommend \
16 | using the :py:func:`pewtils.http.extract_domain_from_url` function instead.
17 | """
18 | 
19 | HTTP_REGEX = re.compile(r"^http(?:s)?\:\/\/")
20 | """
21 | A compiled regular expression for finding HTTP/S prefixes.
22 | """
23 | 
24 | US_DOLLAR_REGEX = re.compile(
25 |     r"(\$(?:[1-9][0-9]{0,2}(?:(?:\,[0-9]{3})+)?(?:\.[0-9]{1,2})?))\b"
26 | )
27 | """
28 | A compiled regular expression for finding USD monetary amounts.
29 | """
30 | 
31 | TITLEWORD_REGEX = re.compile(r"\b([A-Z][a-z]+)\b")
32 | """
33 | A compiled regular expression for finding basic title-cased words.
34 | """
35 | 
36 | NUMBER_REGEX = re.compile(r"\b([0-9]+)\b")
37 | """
38 | A compiled regular expression for finding raw numbers.
39 | """
40 | 
41 | NONALPHA_REGEX = re.compile(r"[^\w]")
42 | """
43 | A compiled regular expression for finding non-word characters (anything other than letters, digits, and underscores).
44 | """
45 | 
--------------------------------------------------------------------------------
/tests/files/example.html:
--------------------------------------------------------------------------------
 1 | <!doctype html>
 2 | <html>
 3 | <head>
 4 | 
 5 |     <title>Example Domain</title>
 6 | 
 7 |     <meta charset="utf-8" />
 8 |     <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
 9 |     <meta name="viewport" content="width=device-width, initial-scale=1" />
10 |     <style type="text/css">
11 |     body {
12 |         background-color: #f0f0f2;
13 |         margin: 0;
14 |         padding: 0;
15 |         font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
16 | 
17 |     }
18 |     div {
19 |         width: 600px;
20 |         margin: 5em auto;
21 |         padding: 50px;
22 |         background-color: #fff;
23 |         border-radius: 1em;
24 |     }
25 |     a:link, a:visited {
26 |         color: #38488f;
27 |         text-decoration: none;
28 |     }
29 |     @media (max-width: 700px) {
30 |         body {
31 |             background-color: #fff;
32 |         }
33 |         div {
34 |             width: auto;
35 |             margin: 0 auto;
36 |             border-radius: 0;
37 |             padding: 1em;
38 |         }
39 |     }
40 | 
41 |     </style>
42 | </head>
43 | <body>
44 | <div>
45 |     <h1>Example Domain</h1>
46 |     <p>This domain is established to be used for illustrative examples in documents. You may use this
47 |     domain in examples without prior coordination or asking for permission.</p>
48 |     <p><a href="https://www.iana.org/domains/example">More information...</a></p>
49 | </div>
50 | </body>
51 | </html>
52 | 
--------------------------------------------------------------------------------
/.github/workflows/build-release.yaml:
--------------------------------------------------------------------------------
 1 | on:
 2 |   push:
 3 |     tags:
 4 |       - 'v*.*.*'
 5 | 
 6 | name: build-release
 7 | 
 8 | jobs:
 9 |   build-release:
10 |     runs-on: pewtils-runner
11 |     name: build release package
12 |     env:
13 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
14 |     permissions:
15 |       contents: write
16 | 
17 |     steps:
18 |       - uses: actions/checkout@v2
19 | 
20 |       - name: Install python dependencies
21 |         run: |
22 |           while read requirement; do mamba install "conda-forge::$requirement" || true; done < requirements.txt
23 |           pip install -r requirements.txt
24 | 
25 |       - name: Lint with flake8
26 |         run: |
27 |           make python_lint_errors
28 |           make python_lint_quality
29 | 
30 |       - name: Run unit tests
31 |         run: |
32 |           make python_test
33 | 
34 |       - name: Build Python package
35 |         run: |
36 |           make python_build
37 | 
38 |       - name: Upload to Nexus Repository
39 |         run: |
40 |           twine upload --non-interactive --repository-url '${{ secrets.PACKAGE_REPO_URL_PYTHON }}' --username '${{ secrets.PACKAGE_REPO_USER }}' --password '${{ secrets.PACKAGE_REPO_PASSWORD }}' dist/*
41 | 
42 |       - name: Publish Release
43 |         uses: softprops/action-gh-release@v1
44 |         env:
45 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
46 |         with:
47 |           prerelease: False
48 |           body_path: CHANGELOG.md
49 |           files: |
50 |             *.whl
51 | 
--------------------------------------------------------------------------------
/.github/workflows/build-main.yaml:
--------------------------------------------------------------------------------
 1 | on:
 2 |   push:
 3 |     branches:
 4 |       - main
 5 |     paths:
 6 |       - .github/workflows/build-main.yaml
 7 |       - Makefile
 8 |       - pewtils/**
 9 |       - requirements.txt
10 |       - setup.py
11 |       - tests/**
12 | 
13 | name: build-main
14 | 
15 | jobs:
16 |   build-main:
17 |     runs-on: pewtils-runner
18 |     name: build main branch package
19 |     env:
20 |       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
21 |     permissions:
22 |       contents: write
23 | 
24 |     steps:
25 |       - uses: actions/checkout@v2
26 | 
27 |       - name: Bump the build version
28 |         run: |
29 |           git config --global user.name "Github Actions"
30 |           git config --global user.email "<>"
31 |           make bump
32 | 
33 |       - name: Install python dependencies
34 |         run: |
35 |           while read requirement; do mamba install "conda-forge::$requirement" || true; done < requirements.txt
36 |           pip install -r requirements.txt
37 | 
38 |       - name: Lint with flake8
39 |         run: |
40 |           make python_lint_errors
41 |           make python_lint_quality
42 | 
43 |       - name: Run unit tests
44 |         run: |
45 |           make python_test
46 | 
47 |       - name: Build Python package
48 |         run: |
49 |           make python_build
50 | 
51 |       - name: Upload to Package Repository
52 |         run: |
53 |           twine upload --non-interactive --repository-url '${{ secrets.PACKAGE_REPO_URL_PYTHON }}' --username '${{ secrets.PACKAGE_REPO_USER }}' --password '${{ secrets.PACKAGE_REPO_PASSWORD }}' dist/*
54 | 
55 |       - name: Sync new build commits
56 |         run: |
57 |           make sync_branch
58 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | with open("README.md") as README:
 4 |     readme = str(README.read())
 5 | 
 6 | with open("requirements.txt") as reqs:
 7 |     lines = reqs.read().split("\n")
 8 |     install_requires = [line for line in lines if line]
 9 | 
10 | setup(
11 |     name="pewtils",
12 |     version="1.1.6.dev1",
13 |     
description="General programming utilities from Pew Research Center", 14 | long_description=readme, 15 | long_description_content_type="text/markdown", 16 | url="https://github.com/pewresearch/pewtils", 17 | author="Pew Research Center", 18 | author_email="info@pewresearch.org", 19 | install_requires=install_requires, 20 | packages=find_packages(exclude=["contrib", "docs", "tests"]), 21 | include_package_data=True, 22 | keywords="utilities, link standardization, input, output", 23 | license="GPLv2+", 24 | classifiers=[ 25 | # https://pypi.python.org/pypi?%3Aaction=list_classifiers 26 | "Development Status :: 5 - Production/Stable", 27 | # "Development Status :: 6 - Mature", 28 | # "Development Status :: 7 - Inactive", 29 | "Environment :: Console", 30 | "Intended Audience :: Developers", 31 | "Intended Audience :: Education", 32 | "Intended Audience :: Information Technology", 33 | "Intended Audience :: Science/Research", 34 | "License :: OSI Approved :: GNU General Public License v2 or later (GPLv2+)", 35 | "Operating System :: OS Independent", 36 | "Programming Language :: Python :: 3.7", 37 | "Programming Language :: Python :: 3.8", 38 | "Programming Language :: Python :: 3.9", 39 | "Programming Language :: Python", 40 | "Topic :: Software Development :: Libraries :: Python Modules", 41 | "Topic :: Utilities", 42 | ], 43 | ) 44 | -------------------------------------------------------------------------------- /.github/runner.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: actions.summerwind.dev/v1alpha1 2 | kind: RunnerDeployment 3 | metadata: 4 | name: pewtils-dev-runner 5 | namespace: github-runners 6 | 7 | spec: 8 | replicas: 1 9 | template: 10 | spec: 11 | repository: pewresearch/pewtils_dev 12 | image: 458280294434.dkr.ecr.us-east-1.amazonaws.com/labs-actions-runner@sha256:41a92e6db53febef2db892cea45680d480dce6c8f576367b1245d57f017e7935 13 | imagePullPolicy: Always 14 | serviceAccountName: labs-runner 15 | labels: 16 | - pewtils-runner 17 | dockerEnabled: false 18 | dockerdContainerResources: 19 | limits: 20 | cpu: "4.0" 21 | memory: "8Gi" 22 | 23 | requests: 24 | cpu: "100m" 25 | memory: "2Gi" 26 | 27 | env: 28 | - name: AWS_ACCESS_KEY_ID 29 | valueFrom: 30 | secretKeyRef: 31 | name: labs-runner 32 | key: AWS_ACCESS_KEY_ID 33 | 34 | - name: AWS_SECRET_ACCESS_KEY 35 | valueFrom: 36 | secretKeyRef: 37 | name: labs-runner 38 | key: AWS_SECRET_ACCESS_KEY 39 | 40 | metadata: 41 | annotations: 42 | cluster-autoscaler.kubernetes.io/safe-to-evict: "true" 43 | 44 | --- 45 | 46 | apiVersion: actions.summerwind.dev/v1alpha1 47 | kind: RunnerDeployment 48 | metadata: 49 | name: pewtils-runner 50 | namespace: github-runners 51 | 52 | spec: 53 | replicas: 1 54 | template: 55 | spec: 56 | repository: pewresearch/pewtils 57 | image: 458280294434.dkr.ecr.us-east-1.amazonaws.com/labs-actions-runner@sha256:41a92e6db53febef2db892cea45680d480dce6c8f576367b1245d57f017e7935 58 | imagePullPolicy: Always 59 | serviceAccountName: labs-runner 60 | labels: 61 | - pewtils-runner 62 | dockerEnabled: false 63 | dockerdContainerResources: 64 | limits: 65 | cpu: "4.0" 66 | memory: "8Gi" 67 | 68 | requests: 69 | cpu: "100m" 70 | memory: "2Gi" 71 | 72 | metadata: 73 | annotations: 74 | cluster-autoscaler.kubernetes.io/safe-to-evict: "true" 75 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | BRANCH := $(shell git rev-parse 
--symbolic-full-name --abbrev-ref HEAD) 2 | 3 | # by default, we'll bump the "build" part of the version, for non-releases 4 | PART = build 5 | 6 | # if the current version is a release and not a dev build, bump the patch part instead 7 | VERSION := $(shell grep -Po '(?<=current_version = )[\w\d\.]+' .bumpversion.cfg) 8 | ifeq (,$(findstring dev,$(VERSION))) 9 | ifeq ($(PART),build) 10 | PART = patch 11 | endif 12 | endif 13 | 14 | # Minimal makefile for Sphinx documentation 15 | 16 | SPHINXOPTS = 17 | SPHINXBUILD = sphinx-build 18 | SOURCEDIR = docs_source 19 | BUILDDIR = _build 20 | 21 | # Put it first so that "make" without argument is like "make help". 22 | help: 23 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 24 | 25 | .PHONY: help Makefile 26 | 27 | docs: 28 | -rm -rf _build/ 29 | make html 30 | 31 | s3_docs: docs 32 | aws s3 sync --delete _build/html/ s3://docs.pewresearch.tech/pewtils/ 33 | 34 | github_docs: 35 | make html 36 | -mv _build/html /tmp/html 37 | -rm -rf _build 38 | -git branch -D docs 39 | git fetch --all 40 | git checkout docs 41 | -mv .git /tmp/.git 42 | -rm -rf * .* 43 | -mv /tmp/.git . 44 | cp -a /tmp/html/. . 45 | -rm -rf /tmp/html 46 | git add -A . 47 | git commit -m "latest docs" 48 | git push origin docs 49 | git checkout $(BRANCH) 50 | 51 | python_lint_errors: 52 | # stop the build if there are Python syntax errors or undefined names 53 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics --exclude=.git,__pycache__,build,dist 54 | 55 | python_lint_quality: 56 | flake8 . --exit-zero --statistics --count --show-source --max-line-length=127 --ignore=E201,E202,E501,E722,W503,W504 --exclude=.git,__pycache__,build,dist 57 | 58 | github_lint_flake8: 59 | flake8 . --max-line-length 127 --ignore=E201,E202,E501,E722,W503,W504 --exclude=.git,__pycache__,build,dist | reviewdog -reporter=github-pr-check -f=flake8 60 | 61 | python_test: 62 | python3 -m unittest tests 63 | 64 | python_build: 65 | python3 setup.py sdist bdist_wheel 66 | 67 | .ONESHELL: 68 | bump: 69 | git checkout $(BRANCH) 70 | git pull origin $(BRANCH) 71 | bumpversion --commit $(PART) 72 | 73 | .ONESHELL: 74 | sync_branch: 75 | git checkout $(BRANCH) 76 | git pull origin $(BRANCH) 77 | git push origin $(BRANCH) 78 | 79 | .ONESHELL: 80 | release: 81 | git checkout $(BRANCH) 82 | git pull origin $(BRANCH) 83 | bumpversion --commit $(PART) 84 | bumpversion --commit --tag release 85 | git push origin $(BRANCH) --follow-tags 86 | 87 | # Catch-all target: route all unknown targets to Sphinx using the new 88 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 89 | %: Makefile 90 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 91 | -------------------------------------------------------------------------------- /docs_source/index.rst: -------------------------------------------------------------------------------- 1 | Pewtils 2 | =================================================================== 3 | 4 | Pewtils is a package of useful programming utilities developed at the Pew Research Center \ 5 | over the years. Most of the functions in Pewtils can be found in the root module, while a \ 6 | handful of submodules contain more specialized utilities for working with files, web \ 7 | resources, and regular expressions. 8 | 9 | .. 
toctree::
10 |    :maxdepth: 1
11 |    :caption: Table of Contents:
12 | 
13 |    Core Functions <pewtils_core>
14 |    HTTP Utilities <http>
15 |    I/O Tools <io>
16 |    Regex Patterns <regex>
17 |    Examples <examples>
18 | 
19 | Installation
20 | ---------------
21 | 
22 | To install, you can use ``pip``:
23 | 
24 | .. code-block:: bash
25 | 
26 |     pip install git+https://github.com/pewresearch/pewtils#egg=pewtils
27 | 
28 | Or you can install from source:
29 | 
30 | .. code-block:: bash
31 | 
32 |     git clone https://github.com/pewresearch/pewtils.git
33 |     cd pewtils
34 |     python setup.py install
35 | 
36 | .. note::
37 |     This is a Python 3 package. Though it is compatible with Python 2, many of its dependencies are \
38 |     planning to drop support for earlier versions if they haven't already. We highly recommend \
39 |     you upgrade to Python 3.
40 | 
41 | Installation Troubleshooting
42 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
43 | 
44 | Using 64-bit Python
45 | """"""""""""""""""""
46 | 
47 | Some of our libraries require the use of 64-bit Python. If you encounter errors during installation \
48 | that are related to missing libraries, you may be using 32-bit Python. We recommend that you uninstall \
49 | this version and switch to a 64-bit version instead. On Windows, these will be marked with ``x86-64``; you \
50 | can find the latest 64-bit versions of Python `here <https://www.python.org/downloads>`_.
51 | 
52 | Installing ssdeep
53 | """"""""""""""""""""""""""""
54 | 
55 | ssdeep is an optional dependency that can be used by the :py:func:`pewtils.get_hash` function in Pewtils. \
56 | Installation instructions for various Linux distributions can be found in the library's \
57 | `documentation <https://python-ssdeep.readthedocs.io/en/latest/installation.html>`_. The ssdeep \
58 | Python library is not currently compatible with Windows. \
59 | Installing ssdeep on Mac OS may involve a few additional steps, detailed below:
60 | 
61 | 1. Install Homebrew
62 | 
63 | 2. Install Xcode
64 | 
65 | .. code-block:: bash
66 | 
67 |     xcode-select --install
68 | 
69 | 3. Install system dependencies
70 | 
71 | .. code-block:: bash
72 | 
73 |     brew install pkg-config libffi libtool automake
74 |     ln -s /usr/local/bin/glibtoolize /usr/local/bin/libtoolize
75 | 
76 | 4. Install ssdeep with an additional flag to build the required libraries
77 | 
78 | .. code-block:: bash
79 | 
80 |     BUILD_LIB=1 pip install ssdeep
81 | 
82 | 5. If step 4 fails, you may need to redirect your system to the new libraries by setting the following flags:
83 | 
84 | .. code-block:: bash
85 | 
86 |     export LIBTOOL=`which glibtool`
87 |     export LIBTOOLIZE=`which glibtoolize`
88 | 
89 | Do this and try step 4 again.
90 | 
91 | 6. Now you should be able to run the main installation process detailed above.
92 | 
93 | 
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing to Pewtils
 2 | 
 3 | 
 4 | 
 5 | [repo]: https://github.com/pewresearch/pewtils
 6 | [issues]: https://github.com/pewresearch/pewtils/issues
 7 | [new_issue]: https://github.com/pewresearch/pewtils/issues/new
 8 | [email]: info@pewresearch.org
 9 | 
10 | ## How you can contribute
11 | 
12 | There are several ways you can contribute to this project. If you want to know more about why and how to contribute to open source projects like this one, see this [Open Source Guide](https://opensource.guide/how-to-contribute/).
13 | 
14 | ### Share the love ❤️
15 | 
16 | Think **pewtils** is useful? Let others discover it by telling them in person, via Twitter, or in a blog post.
17 | 
18 | ### Ask a question ⁉️
19 | 
20 | Using **pewtils** and got stuck?
Check out the [documentation](https://pewresearch.github.io/pewtils/).
21 | Still stuck? Post your question as an [issue on GitHub][new_issue]. While we cannot offer user support, we'll do our best to address it, as questions often lead to better documentation or the discovery of bugs.
22 | 
23 | Want to ask a question in private? Contact the package maintainer by [email][email].
24 | 
25 | ### Propose an idea 💡
26 | 
27 | Have an idea for a new **pewtils** feature? Take a look at the [issue list][issues] to see if it has already been suggested. If not, suggest your idea as an [issue on GitHub][new_issue]. While we can't promise to implement your idea, it helps to:
28 | 
29 | * Explain in detail how it would work.
30 | * Keep the scope as narrow as possible.
31 | 
32 | See below if you want to contribute code for your idea as well.
33 | 
34 | ### Report a bug 🐛
35 | 
36 | Using **pewtils** and discovered a bug? That's annoying! Don't let others have the same experience and report it as an [issue on GitHub][new_issue] so we can fix it. A good bug report makes it easier for us to do so, so please include:
37 | 
38 | * Your operating system name and version (e.g. macOS 10.13.6).
39 | * Any details about your local setup that might be helpful in troubleshooting.
40 | * Detailed steps to reproduce the bug.
41 | 
42 | ### Contribute code 📝
43 | 
44 | Care to fix bugs or implement new functionality for **pewtils**? Awesome! 👏 Have a look at the [issue list][issues] and leave a comment on the things you want to work on. When making contributions, please follow the development guidelines below.
45 | 
46 | #### Development guidelines
47 | 
48 | We try to follow the [GitHub flow](https://guides.github.com/introduction/flow/) for development, and we use Python docstrings and [Sphinx](https://www.sphinx-doc.org/en/master/) to document all of our code.
49 | 
50 | 1. Fork [this repo][repo] and clone it to your computer. To learn more about this process, see [this guide](https://guides.github.com/activities/forking/).
51 | 2. If you have forked and cloned the project before and it has been a while since you worked on it, [pull changes from the original repo](https://help.github.com/articles/merging-an-upstream-repository-into-your-fork/) to your clone by using `git pull upstream main`.
52 | 3. Make your changes:
53 |     * Write your code.
54 |     * Test your code (bonus points for adding unit tests).
55 |     * Document your code (see function documentation above).
56 | 4. If you added unit tests, make sure everything works by running the `python -m unittest tests` command from the root directory of the repository.
57 | 5. If you added or updated documentation, build a fresh version of the docs by running the `make github_docs` command from the root directory of the repository.
58 | 6. Commit and push your changes.
59 | 7. Submit a [pull request](https://guides.github.com/activities/forking/#making-a-pull-request).
-------------------------------------------------------------------------------- /tests/regex.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | 4 | class RegexTests(unittest.TestCase): 5 | 6 | """ 7 | To test, navigate to pewtils root folder and run `python -m unittest tests` 8 | """ 9 | 10 | def setUp(self): 11 | pass 12 | 13 | def test_url_regex(self): 14 | 15 | from pewtils.regex import URL_REGEX 16 | 17 | for val in [ 18 | "example.com", 19 | "www.example.com", 20 | "http://example.com", 21 | "https://example.com", 22 | "https://www.example.com", 23 | "example.com/test", 24 | "example.com/test?test=test", 25 | "http://example.com?test=test&test=test", 26 | "https://t.co/example", 27 | ]: 28 | result = URL_REGEX.findall("test {} test".format(val)) 29 | self.assertEqual(result[0], val) 30 | 31 | def test_domain_regex(self): 32 | from pewtils.regex import DOMAIN_REGEX 33 | 34 | for val in ["example.com", "http://example.com"]: 35 | result = DOMAIN_REGEX.findall(val) 36 | self.assertEqual(result[0], "example.com") 37 | for val in [ 38 | "test.example.com", 39 | "http://test.example.com", 40 | "https://www.test.example.com", 41 | "test.example.com/test", 42 | ]: 43 | result = DOMAIN_REGEX.findall(val) 44 | self.assertEqual(result[0], "test.example.com") 45 | 46 | def test_http_regex(self): 47 | 48 | from pewtils.regex import HTTP_REGEX 49 | 50 | for val in [ 51 | "http://example.com", 52 | "https://example.com", 53 | "https://www.example.com", 54 | "http://example.com?test=test&test=test", 55 | ]: 56 | result = HTTP_REGEX.match(val) 57 | self.assertIsNotNone(result) 58 | 59 | for val in [ 60 | "example.com", 61 | "www.example.com", 62 | "example.com/test", 63 | "example.com/test?test=test", 64 | ]: 65 | result = HTTP_REGEX.match(val) 66 | self.assertIsNone(result) 67 | 68 | def test_us_dollar_regex(self): 69 | from pewtils.regex import US_DOLLAR_REGEX 70 | 71 | for val in [ 72 | "$1.00", 73 | "$10", 74 | "$10,000", 75 | "$999,999", 76 | "$1,000,000,000", 77 | "$1,000,000,000.00", 78 | ]: 79 | result = US_DOLLAR_REGEX.findall(val) 80 | self.assertEqual(result[0], val) 81 | 82 | for val in ["$01,000", "$01", "$1a0,000", "$.00", "$01.00"]: 83 | result = US_DOLLAR_REGEX.findall(val) 84 | self.assertEqual(len(result), 0) 85 | 86 | def test_titleword_regex(self): 87 | from pewtils.regex import TITLEWORD_REGEX 88 | 89 | for val, expected in [ 90 | ("this is a Test", ["Test"]), 91 | ("testing One two three", ["One"]), 92 | ("testing One Two Three", ["One", "Two", "Three"]), 93 | ("testing One1 Two2 Three3", []), 94 | ("testing one two three", []), 95 | ]: 96 | result = TITLEWORD_REGEX.findall(val) 97 | self.assertEqual(result, expected) 98 | 99 | def test_number_regex(self): 100 | from pewtils.regex import NUMBER_REGEX 101 | 102 | for val, expected in [ 103 | ("one 2 three", ["2"]), 104 | ("1234", ["1234"]), 105 | (" 12 345 ", ["12", "345"]), 106 | ("one2three", []), 107 | ]: 108 | result = NUMBER_REGEX.findall(val) 109 | self.assertEqual(result, expected) 110 | 111 | def test_nonalpha_regex(self): 112 | from pewtils.regex import NONALPHA_REGEX 113 | 114 | for val, expected in [ 115 | ("abc$efg", ["$"]), 116 | ("one ^%& two", [" ", "^", "%", "&", " "]), 117 | ("one two three", [" ", " "]), 118 | ("1234", []), 119 | ]: 120 | result = NONALPHA_REGEX.findall(val) 121 | self.assertEqual(result, expected) 122 | -------------------------------------------------------------------------------- /README.md: 
--------------------------------------------------------------------------------
 1 | # pewtils
 2 | 
 3 | Pewtils is a package of useful programming utilities developed at the Pew Research Center over the years. Most of the functions in Pewtils can be found in the root module, while a handful of submodules contain more specialized utilities for working with files, web resources, and regular expressions.
 4 | 
 5 | ## Installation
 6 | 
 7 | To install, you can use `pip`:
 8 | 
 9 |     pip install git+https://github.com/pewresearch/pewtils#egg=pewtils
10 | 
11 | Or you can install from source:
12 | 
13 |     git clone https://github.com/pewresearch/pewtils.git
14 |     cd pewtils
15 |     python setup.py install
16 | 
17 | ### Installation Troubleshooting
18 | 
19 | #### Using 64-bit Python
20 | 
21 | Some of our libraries require the use of 64-bit Python. If you encounter errors during installation that are related to missing libraries, you may be using 32-bit Python. We recommend that you uninstall this version and switch to a 64-bit version instead. On Windows, these will be marked with `x86-64`; you can find the latest 64-bit versions of Python [here](https://www.python.org/downloads).
22 | 
23 | #### Installing ssdeep
24 | 
25 | ssdeep is an optional dependency that can be used by the `get_hash` function in Pewtils. Installation instructions for various Linux distributions can be found in the library's [documentation](https://python-ssdeep.readthedocs.io/en/latest/installation.html). The ssdeep Python library is not currently compatible with Windows. Installing ssdeep on Mac OS may involve a few additional steps, detailed below:
26 | 
27 | 1. Install Homebrew
28 | 2. Install Xcode
29 |     ```
30 |     xcode-select --install
31 |     ```
32 | 3. Install system dependencies
33 |     ```
34 |     brew install pkg-config libffi libtool automake
35 |     ln -s /usr/local/bin/glibtoolize /usr/local/bin/libtoolize
36 |     ```
37 | 4. Install ssdeep with an additional flag to build the required libraries
38 |     ```
39 |     BUILD_LIB=1 pip install ssdeep
40 |     ```
41 | 5. If step 4 fails, you may need to redirect your system to the new libraries by setting the following flags:
42 |     ```
43 |     export LIBTOOL=`which glibtool`
44 |     export LIBTOOLIZE=`which glibtoolize`
45 |     ```
46 |     Do this and try step 4 again.
47 | 6. Now you should be able to run the main installation process detailed above.
48 | 
49 | ## Documentation
50 | 
51 | Please refer to the [official documentation](https://pewresearch.github.io/pewtils/) for information on how to use this package.
52 | 
53 | ## Use Policy
54 | 
55 | In addition to the [license](https://github.com/pewresearch/pewtils/blob/master/LICENSE), Users must abide by the following conditions:
56 | 
57 | - User may not use the Center's logo.
58 | - User may not use the Center's name in any advertising, marketing or promotional materials.
59 | - User may not use the licensed materials in any manner that implies, suggests, or could otherwise be perceived as attributing a particular policy or lobbying objective or opinion to the Center, or as a Center endorsement of a cause, candidate, issue, party, product, business, organization, religion or viewpoint.
60 | 
61 | ## Issues and Pull Requests
62 | 
63 | This code is provided as-is for use in your own projects. You are free to submit issues and pull requests with any questions or suggestions you may have. We will do our best to respond within a 30-day time period.
64 | 
65 | ## Recommended Package Citation
66 | 
67 | Pew Research Center, 2020, "pewtils." Available at: github.com/pewresearch/pewtils
68 | 
69 | ## Acknowledgements
70 | 
71 | The following authors contributed to this repository:
72 | 
73 | - Patrick van Kessel
74 | - Regina Widjaya
75 | - Skye Toor
76 | - Emma Remy
77 | - Onyi Lam
78 | - Brian Broderick
79 | - Galen Stocking
80 | - Dennis Quinn
81 | 
82 | ## About Pew Research Center
83 | 
84 | Pew Research Center is a nonpartisan fact tank that informs the public about the issues, attitudes and trends shaping the world. It does not take policy positions. The Center conducts public opinion polling, demographic research, content analysis and other data-driven social science research. It studies U.S. politics and policy; journalism and media; internet, science and technology; religion and public life; Hispanic trends; global attitudes and trends; and U.S. social and demographic trends. All of the Center's reports are available at [www.pewresearch.org](http://www.pewresearch.org). Pew Research Center is a subsidiary of The Pew Charitable Trusts, its primary funder.
85 | 
86 | ## Contact
87 | 
88 | For all inquiries, please email info@pewresearch.org. Please be sure to specify your deadline, and we will get back to you as soon as possible. This email account is monitored regularly by Pew Research Center Communications staff.
89 | 
90 | 
--------------------------------------------------------------------------------
/docs_source/conf.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #
 3 | # Configuration file for the Sphinx documentation builder.
 4 | #
 5 | # This file does only contain a selection of the most common options. For a
 6 | # full list see the documentation:
 7 | # http://www.sphinx-doc.org/en/master/config
 8 | 
 9 | # -- Path setup --------------------------------------------------------------
10 | 
11 | # If extensions (or modules to document with autodoc) are in another directory,
12 | # add these directories to sys.path here. If the directory is relative to the
13 | # documentation root, use os.path.abspath to make it absolute, like shown here.
14 | #
15 | import os, sys
16 | 
17 | sys.path.insert(0, os.path.abspath(".."))
18 | 
19 | 
20 | # -- Project information -----------------------------------------------------
21 | 
22 | project = "pewtils"
23 | copyright = "2020, Pew Research Center"
24 | author = "Pew Research Center"
25 | 
26 | # The short X.Y version
27 | version = ""
28 | # The full version, including alpha/beta/rc tags
29 | release = "1.1.6.dev1"
30 | 
31 | 
32 | # -- General configuration ---------------------------------------------------
33 | 
34 | # If your documentation needs a minimal Sphinx version, state it here.
35 | #
36 | # needs_sphinx = '1.0'
37 | 
38 | # Add any Sphinx extension module names here, as strings. They can be
39 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
40 | # ones.
41 | extensions = [
42 |     "sphinx.ext.autodoc",
43 |     "sphinx.ext.intersphinx",
44 |     "sphinx.ext.coverage",
45 |     "sphinx.ext.mathjax",
46 |     "sphinx.ext.ifconfig",
47 |     "sphinx.ext.viewcode",
48 |     "sphinx.ext.githubpages",
49 |     "autodocsumm",
50 | ]
51 | 
52 | # Add any paths that contain templates here, relative to this directory.
53 | templates_path = ["_templates"]
54 | 
55 | # The suffix(es) of source filenames.
56 | # You can specify multiple suffixes as a list of strings:
57 | #
58 | # source_suffix = ['.rst', '.md']
59 | source_suffix = ".rst"
60 | 
61 | # The master toctree document.
62 | master_doc = "index"
63 | 
64 | # The language for content autogenerated by Sphinx. Refer to documentation
65 | # for a list of supported languages.
66 | #
67 | # This is also used if you do content translation via gettext catalogs.
68 | # Usually you set "language" from the command line for these cases.
69 | language = None
70 | 
71 | # List of patterns, relative to source directory, that match files and
72 | # directories to ignore when looking for source files.
73 | # This pattern also affects html_static_path and html_extra_path.
74 | exclude_patterns = []
75 | 
76 | # The name of the Pygments (syntax highlighting) style to use.
77 | pygments_style = None
78 | 
79 | # Prevent autodoc from sorting document members alphabetically
80 | autodoc_member_order = "bysource"
81 | 
82 | # If true, the current module name will be prepended to all description
83 | # unit titles (such as .. function::).
84 | add_module_names = False
85 | 
86 | # -- Options for HTML output -------------------------------------------------
87 | 
88 | # The theme to use for HTML and HTML Help pages. See the documentation for
89 | # a list of builtin themes.
90 | #
91 | html_theme = "sphinx_rtd_theme"
92 | 
93 | # Theme options are theme-specific and customize the look and feel of a theme
94 | # further. For a list of options available for each theme, see the
95 | # documentation.
96 | #
97 | html_theme_options = {"navigation_depth": 3}
98 | 
99 | # Add any paths that contain custom static files (such as style sheets) here,
100 | # relative to this directory. They are copied after the builtin static files,
101 | # so a file named "default.css" will overwrite the builtin "default.css".
102 | html_static_path = ["_static"]
103 | html_context = {
104 |     "css_files": ["_static/theme_overrides.css"]  # override wide tables in RTD theme
105 | }
106 | 
107 | # Custom sidebar templates, must be a dictionary that maps document names
108 | # to template names.
109 | #
110 | # The default sidebars (for documents that don't match any pattern) are
111 | # defined by theme itself. Builtin themes are using these templates by
112 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
113 | # 'searchbox.html']``.
114 | #
115 | # html_sidebars = {}
116 | 
117 | 
118 | # -- Options for HTMLHelp output ---------------------------------------------
119 | 
120 | # Output file base name for HTML help builder.
121 | htmlhelp_basename = "pewtilsdoc"
122 | 
123 | 
124 | # -- Options for LaTeX output ------------------------------------------------
125 | 
126 | latex_elements = {
127 |     # The paper size ('letterpaper' or 'a4paper').
128 |     #
129 |     # 'papersize': 'letterpaper',
130 |     # The font size ('10pt', '11pt' or '12pt').
131 |     #
132 |     # 'pointsize': '10pt',
133 |     # Additional stuff for the LaTeX preamble.
134 |     #
135 |     # 'preamble': '',
136 |     # Latex figure (float) alignment
137 |     #
138 |     # 'figure_align': 'htbp',
139 | }
140 | 
141 | # Grouping the document tree into LaTeX files. List of tuples
142 | # (source start file, target name, title,
143 | #  author, documentclass [howto, manual, or own class]).
144 | latex_documents = [
145 |     (
146 |         master_doc,
147 |         "pewtils.tex",
148 |         "pewtils Documentation",
149 |         "Pew Research Center",
150 |         "manual",
151 |     )
152 | ]
153 | 
154 | 
155 | # -- Options for manual page output ------------------------------------------
156 | 
157 | # One entry per manual page. List of tuples
158 | # (source start file, name, description, authors, manual section).
159 | man_pages = [(master_doc, "pewtils", "pewtils Documentation", [author], 1)]
160 | 
161 | 
162 | # -- Options for Texinfo output ----------------------------------------------
163 | 
164 | # Grouping the document tree into Texinfo files. List of tuples
165 | # (source start file, target name, title, author,
166 | #  dir menu entry, description, category)
167 | texinfo_documents = [
168 |     (
169 |         master_doc,
170 |         "pewtils",
171 |         "pewtils Documentation",
172 |         author,
173 |         "pewtils",
174 |         "General programming utilities from Pew Research Center.",
175 |         "Miscellaneous",
176 |     )
177 | ]
178 | 
179 | 
180 | # -- Options for Epub output -------------------------------------------------
181 | 
182 | # Bibliographic Dublin Core info.
183 | epub_title = project
184 | 
185 | # The unique identifier of the text. This can be a ISBN number
186 | # or the project homepage.
187 | #
188 | # epub_identifier = ''
189 | 
190 | # A unique identification for the text.
191 | #
192 | # epub_uid = ''
193 | 
194 | # A list of files that should not be packed into the epub file.
195 | epub_exclude_files = ["search.html"]
196 | 
197 | 
198 | # -- Extension configuration -------------------------------------------------
199 | 
200 | 
201 | def setup(app):
202 |     app.add_css_file("theme_overrides.css")
203 | 
--------------------------------------------------------------------------------
/pewtils/vanity_link_shorteners.csv:
--------------------------------------------------------------------------------
  1 | shortener,expanded,historical
  2 | 12ne.ws,12news.com,0
  3 | 2wsb.tv,wsbtv.com,0
  4 | ab.co,abc.net.au,0
  5 | abcn.ws,abcnews.com,1
  6 | actb.lu,actblue.com,1
  7 | aje.io,aljazeera.com,0
  8 | ampr.gs,americanprogress.org,0
  9 | amzn.com,amazon.com,0
 10 | amzn.to,amazon.com,0
 11 | apne.ws,apnews.com,0
 12 | armytim.es,armytimes.com,0
 13 | atxne.ws,statesman.com,0
 14 | ayre.to,calvinayre.com,1
 15 | azc.cc,azcentral.com,0
 16 | bayareane.ws,eastbaytimes.com,0
 17 | bbc.in,bbc.co.uk,0
 18 | bcove.me,bcove.me,0
 19 | bernie.to,berniesanders.com,0
 20 | bizj.us,bizjournals.com,0
 21 | ble.ac,bleacherreport.com,0
 22 | bloom.bg,bloomberg.com,0
 23 | bloombg.org,bloomberg.org,0
 24 | bos.gl,w.bos.gl,0
 25 | brook.gs,brookings.edu,0
 26 | bsun.md,baltimoresun.com,0
 27 | buswk.co,businessweek.com,1
 28 | bv.ms,bloomberg.com,0
 29 | bzfd.it,buzzfeed.com,0
 30 | c-spanvideo.org,c-span.org,0
 31 | cbsloc.al,cbslocal.com,0
 32 | cbsn.ws,cbsnews.com,0
 33 | chn.ge,change.org,0
 34 | chng.it,change.org,0
 35 | cjky.it,courier-journal.com,1
 36 | cmplx.co,complex.com,0
 37 | cnb.cx,cnbc.com,0
 38 | cnet.co,cnet.com,0
 39 | cnn.it,cnn.com,0
 40 | cnnmon.ie,cnn.com,0
 41 | comsen.se,commonsensemedia.org,0
 42 | conta.cc,constantcontact.com,0
 43 | cour.at,courant.com,1
 44 | cs.pn,c-span.org,1
 45 | csmo.us,cosmopolitan.com,0
 46 | ctrylv.co,countryliving.com,0
 47 | d-news.co,dallasnews.com,1
 48 | dai.ly,dailymotion.com,0
 49 | dailym.ai,dailymail.co.uk,0
 50 | dailysign.al,dailysignal.com,0
 51 | dbtg.tv,bundestag.de,1
 52 | de.gov,delaware.gov,0
 53 | delonline.us,delawareonline.com,0
 54 | detne.ws,detroitnews.com,0
 55 | dlsh.it,delish.com,0
 56 | dmreg.co,desmoinesregister.com,0
 57 | dot.gov,transportation.gov,1
 58 | dpo.st,denverpost.com,0
 59 | econ.st,economist.com,0
 60 | ellemag.co,elle.com,0
 61 | engt.co,engadget.com,0
 62 | entm.ag,entrepreneur.com,0
 63 | es.pn,espn.com,0
 64 | esqr.co,esquire.com,0
 65 | ewar.ren,elizabethwarren.com,1
 66 | f-st.co,fastcompany.com,0
 67 | fanda.co,fandango.com,1
 68 | fb.com,facebook.com,0
 69 | fb.me,facebook.com,0
 70 | fdrl.st,fdrl.st,0
 71 | flic.kr,flic.kr,0
 72 | fmeq.co,fmeq.co,0
 73 | fpa.ac,foodpolicyaction.org,0
 74 | fxn.ws,foxnews.com,0
 75 | g.co,google.com,1
 76 | gizmo.do,gizmodo.com,0
 77 | glblctzn.me,globalcitizen.org,0
 78 | glo.bo,globo.com,0
 79 | gma.abc,goodmorningamerica.com,0
 80 | goldenisles.news,thebrunswicknews.com,0
 81 | gph.is,giphy.com,0
 82 | grnol.co,greenvilleonline.com,0
 83 | harlem.in,harlemunited.org,1
 84 | hbaz.co,harpersbazaar.com,0
 85 | herit.ag,heritage.org,0
 86 | hill.cm,thehill.com,0
 87 | histv.co,history.com,0
 88 | hrc.io,hillaryclinton.com,1
 89 | hrld.us,miamiherald.com,1
 90 | hsbu.us,housebeautiful.com,0
 91 | hucka.be,mikehuckabee.com,1
 92 | huff.lv,huffingtonpost.com,1
 93 | huff.to,huffingtonpost.com,0
 94 | huffp.st,huffingtonpost.com,0
 95 | huffpost.com,huffingtonpost.com,0
 96 | hulu.tv,hulu.com,0
 97 | icont.ac,icont.ac,0
 98 | ift.tt,ifttt.com,0
 99 | il.gov,illinois.gov,0
100 | ind.pn,independent.co.uk,0
101 | indy.st,indystar.com,0
102 | injo.com,ijr.com,1
103 | instagr.am,instagram.com,0
104 | interc.pt,theintercept.com,0
105 | itun.es,itunes.com,0
106 | jrnl.ie,thejournal.ie,0
107 | jwatch.us,judicialwatch.org,1
108 | kpbs.us,kpbs.org,0
109 | kstp.mn,kstp.com,1
110 | ky.gov,kentucky.gov,0
111 | l-bc.co,lbc.co.uk,0
112 | lat.ms,latimes.com,0
113 | linkd.in,linkedin.com,0
114 | lnkd.in,linkedin.com,0
115 | lp.ca,lapresse.ca,0
116 | m.me,messenger.com,0
117 | ma.us,state.ma.us,1
118 | mailchi.mp,mailchimp.com,0
119 | mapq.st,mapquest.com,0
120 | marinetim.es,marinecorpstimes.com,0
121 | md.us,state.md.us,1
122 | meetu.ps,meetup.com,0
123 | mn.us,state.mn.us,1
124 | mol.im,dailymail.co.uk,0
125 | mrie.cl,marieclaire.com,0
126 | mt.gov,montana.gov,0
127 | mycj.co,mycentraljersey.com,0
128 | n.pr,npr.org,0
129 | natl.io,nationalreview.com,1
130 | natl.re,nationalreview.com,1
131 | navtim.es,navytimes.com,0
132 | nbc4i.co,nbc4i.com,0
133 | nbcbay.com,nbcbayarea.com,0
134 | nbcchi.com,nbcchicago.com,0
135 | nbcct.co,nbcconnecticut.com,0
136 | nbcnews.to,nbcnews.com,0
137 | nc1.tv,newscenter1.tv,0
138 | ne.gov,nebraska.gov,0
139 | newspr.es,news-press.com,0
140 | nj-ne.ws,nj.com,0
141 | njersy.co,northjersey.com,0
142 | nm.us,state.nm.us,1
143 | nwk.ee,europe.newsweek.com,1
144 | nws.mx,newsmax.com,1
145 | nwsdy.li,newsday.com,0
146 | ny.us,state.ny.us,1
147 | nydn.us,nydailynews.com,0
148 | nyer.cm,newyorker.com,1
149 | nyp.st,nypost.com,0
150 | nyti.ms,nytimes.com,0
151 | ofa.bo,ofa.us,1
152 | oh.us,state.oh.us,1
153 | ohne.ws,newarkadvocate.com,0
154 | on.fb.me,facebook.com,0
155 | onforb.es,forbes.com,0
156 | p4a.us,peteforamerica.com,1
157 | pa.us,state.pa.us,1
158 | pbpo.st,palmbeachpost.com,1
159 | pdora.co,pandora.com,0
160 | peoplem.ag,people.com,0
161 | pew.org,pewtrusts.org,0
162 | pewrsr.ch,pewresearch.org,0
163 | politi.co,politico.com,0
164 | prn.to,prnewswire.com,0
165 | propub.li,propublica.org,0
166 | ptrtvoic.es,patriotvoices.com,1
167 | r29.co,refinery29.com,0
168 | read.bi,businessinsider.com,0
169 | redd.it,reddit.com,0
170 | reut.rs,reuters.com,0
171 | rlm.ag,magnetmail.net,0
172 | rol.st,rollingstone.com,0
173 | roll.cl,cqrollcall.com,1
174 | rub.io,marcorubio.com,1
175 | sacb.ee,sacbee.com,1
176 | sc.mp,scmp.com,0
177 | scne.ws,thestate.com,1
178 | sen.gov,senate.gov,0
179 | sfex.news,sfexaminer.com,0
180 | slate.me,slate.com,0
181 | spkrryan.us,speaker.gov,1
182 | spon.de,spiegel.de,0
183 | spoti.fi,spotify.com,0
184 | st.news,seattletimes.com,1
185 | stjr.nl,statesmanjournal.com,0
186 | strib.mn,startribune.com,0
187 | tannos.mx,yarithtannos.com,0
188 | tgam.ca,theglobeandmail.com,0
189 | theatln.tc,theatlantic.com,0
190 | thebea.st,thedailybeast.com,0
191 | thegaz.co,thegazette.com,0
192 | thkpr.gs,thinkprogress.org,1
193 | thr.cm,hollywoodreporter.com,0
194 | ti.me,time.com,0
195 | tl.gd,twitlonger.com,0
196 | tlmdo.co,telemundo.com,0
197 | tmz.me,tmz.com,0
198 | tnne.ws,tennessean.com,1
199 | tnw.to,thenextweb.com,0
200 | tonyr.co,tonyrobbins.com,1
201 | trib.in,chicagotribune.com,1
202 | tun.in,tunein.com,0
203 | tusconne.ws,kold.com,1
204 | twimg.com,twitter.com,0
205 | twnctry.co,townandcountrymag.com,0
206 | tws.io,weeklystandard.com,1
207 | txnne.ws,thetexan.news,0
208 | txpo.li,texaspolicy.com,1
209 | u.pw,upworthy.com,0
210 | uni.vi,univision.com,1
211 | usat.ly,usatoday.com,0
212 | usg.lc,usglc.org,1
213 | usm.ag,usmagazine.com,0
214 | virg.in,virgin.com,0
215 | vntyfr.com,vanityfair.com,0
216 | vogue.cm,vogue.com,0
217 | vpr.net,vpr.org,0
218 | wapo.st,washingtonpost.com,1
219 | washex.am,washingtonexaminer.com,0
220 | wb.md,webmd.com,0
221 | wbur.fm,wbur.org,0
222 | wdtn.tv,wdtn.com,0
223 | wef.ch,weforum.org,0
224 | wh.gov,whitehouse.gov,0
225 | wink.news,winknews.com,0
226 | wpo.st,washingtonpost.com,1
227 | wrd.cm,wired.com,0
228 | wtim.es,washingtontimes.com,0
229 | wtr.ie,water.ie,0
230 | wtrne.ws,timesrecordnews.com,0
231 | wwrld.us,wenatcheeworld.com,0
232 | wxch.nl,weather.com,0
233 | yhoo.it,yahoo.com,0
234 | youtu.be,youtube.com,0
--------------------------------------------------------------------------------
/docs_source/examples.rst:
--------------------------------------------------------------------------------
 1 | **************
 2 | Examples
 3 | **************
 4 | 
 5 | Check for null values
 6 | -----------------------------------------------------
 7 | 
 8 | You can use the :py:func:`pewtils.is_null` and :py:func:`pewtils.is_not_null` functions to quickly check for a \
 9 | variety of common null values.
10 | 
11 | .. code-block:: python
12 | 
13 |     from pewtils import is_null
14 |     from pewtils import is_not_null
15 |     import numpy as np
16 | 
17 |     >>> is_null(None)
18 |     True
19 |     >>> is_null("None")
20 |     True
21 |     >>> is_null("nan")
22 |     True
23 |     >>> is_null("")
24 |     True
25 |     >>> is_null(" ")
26 |     True
27 |     >>> is_null("NaN")
28 |     True
29 |     >>> is_null("none")
30 |     True
31 |     >>> is_null("NONE")
32 |     True
33 |     >>> is_null("n/a")
34 |     True
35 |     >>> is_null("N/A")
36 |     True
37 |     >>> is_null(np.nan)
38 |     True
39 |     >>> is_null("-9", custom_nulls=["-9"])
40 |     True
41 |     >>> is_null("Hello World")
42 |     False
43 |     >>> is_null(0.0)
44 |     False
45 | 
46 | Collapse documents into context-sensitive hashes
47 | -----------------------------------------------------
48 | 
49 | When working with large documents, you can use the :py:func:`pewtils.get_hash` function to convert \
50 | them into a variety of different hashed representations. By default, this function uses SSDEEP, which \
51 | produces context-sensitive hashes that can be useful for searching for similar documents.
52 | 
53 | .. code-block:: python
54 | 
55 |     from pewtils import get_hash
56 | 
57 |     >>> doc1 = "This is a document."
58 | >>> doc2 = "This is a document. But this one is longer." 59 | >>> get_hash(doc1) 60 | '3:hMCE+RL:hu+t' 61 | >>> get_hash(doc2) 62 | '3:hMCE+RGreCQHCAb:hu+0rLkb' 63 | # Notice that both hashes start the same way, corresponding to their overlapping text. 64 | 65 | Flatten nested lists 66 | ----------------------------------------------------- 67 | 68 | Easily flatten lists of lists: 69 | 70 | .. code-block:: python 71 | 72 | from pewtils import flatten_list 73 | 74 | >>> nested_lists = [[1, 2, 3], [4, 5, 6]] 75 | >>> flatten_list(nested_lists) 76 | [1, 2, 3, 4, 5, 6] 77 | 78 | Recursively update dictionaries and object attributes 79 | ----------------------------------------------------- 80 | 81 | Map a dictionary or object onto another version of itself to update overlapping attributes: 82 | 83 | .. code-block:: python 84 | 85 | from pewtils import recursive_update 86 | 87 | class TestObject(object): 88 | def __init__(self, value): 89 | self.value = value 90 | self.dict = {"obj_key": "original"} 91 | def __repr__(self): 92 | return("TestObject(value='{}', dict={})".format(self.value, self.dict)) 93 | 94 | original = { 95 | "object": TestObject("original"), 96 | "key1": {"key2": "original"} 97 | } 98 | update = { 99 | "object": {"value": "updated", "dict": {"obj_key": "updated"}}, 100 | "key1": {"key3": "new"} 101 | } 102 | 103 | >>> recursive_update(original, update) 104 | {'object': TestObject(value='updated', dict={'obj_key': 'updated'}), 105 | 'key1': {'key2': 'original', 'key3': 'new'}} 106 | 107 | 108 | Efficiently map a function onto a Pandas Series 109 | ----------------------------------------------------- 110 | 111 | Avoid repeating database lookups or expensive computations when applying a function to a Pandas \ 112 | Series by using the :py:func:`pewtils.cached_series_mapper` function, which caches the results \ 113 | for each value in the series as it iterates. 114 | 115 | .. code-block:: python 116 | 117 | import pandas as pd 118 | from pewtils import cached_series_mapper 119 | 120 | values = ["value"]*10 121 | def my_function(x): 122 | print(x) 123 | return x 124 | 125 | df = pd.DataFrame(values, columns=['column']) 126 | >>> mapped = df['column'].map(my_function) 127 | value 128 | value 129 | value 130 | value 131 | value 132 | value 133 | value 134 | value 135 | value 136 | value 137 | >>> mapped = cached_series_mapper(df['column'], my_function) 138 | value 139 | 140 | Read and write data in a variety of formats 141 | ----------------------------------------------------- 142 | 143 | The :py:class:`pewtils.io.FileHandler` class lets you easily read and write files in a variety of \ 144 | formats with minimal code, and it has support for Amazon S3 too: 145 | 146 | .. 
code-block:: python 147 | 148 | from pewtils.io import FileHandler 149 | 150 | >>> h = FileHandler("./", use_s3=False) # current local folder 151 | >>> df = h.read("my_csv", format="csv") 152 | # Do something and save to Excel 153 | >>> h.write("my_new_csv", df, format="xlsx") 154 | 155 | >>> my_data = [{"key": "value"}] 156 | >>> h.write("my_data", my_data, format="json") 157 | 158 | >>> my_data = ["a", "python", "list"] 159 | >>> h.write("my_data", my_data, format="pkl") 160 | 161 | # To read/write to an S3 bucket, simply pass your bucket name 162 | >>> h = FileHandler("/my_folder", use_s3=True, bucket="my-bucket") 163 | # The FileHandler can also detect your tokens directly from your environment 164 | # Just set the environment variables AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and S3_BUCKET 165 | 166 | Quickly extract text from raw HTML 167 | ----------------------------------------------------- 168 | 169 | It's not always perfect, but the :py:func:`pewtils.http.strip_html` function can often be used to \ 170 | extract most of the valuable text data from raw HTML documents - useful for quick exploratory \ 171 | analysis after scraping a bunch of webpages. 172 | 173 | .. code-block:: python 174 | 175 | from pewtils.http import strip_html 176 | 177 | >>> my_html = "<h1>Header text</h1><p>Body text</p>" 178 | >>> strip_html(my_html) 179 | 'Header text\n\nBody text' 180 | 181 | Standardize URLs and extract domains 182 | ----------------------------------------------------- 183 | 184 | The :py:func:`pewtils.http.canonical_link` function is our best attempt at resolving URLs to their \ 185 | true form: it follows shortened URLs, removes unnecessary GET parameters, and tries to avoid returning \ 186 | incorrect 404 pages in favor of the most informative last-known version of a URL. Once links have been \ 187 | standardized, you can also use the :py:func:`pewtils.http.extract_domain_from_url` function to pull \ 188 | out domains and subdomains. 189 | 190 | .. 
code-block:: python 191 | 192 | from pewtils.http import canonical_link 193 | 194 | >>> canonical_link("https://pewrsr.ch/2lxB0EX?unnecessary_param=1") 195 | "https://www.pewresearch.org/interactives/how-does-a-computer-see-gender/" 196 | 197 | from pewtils.http import extract_domain_from_url 198 | 199 | >>> extract_domain_from_url("http://forums.bbc.co.uk", include_subdomain=False) 200 | "bbc.co.uk" 201 | >>> extract_domain_from_url("http://forums.bbc.co.uk", include_subdomain=True) 202 | "forums.bbc.co.uk" 203 | -------------------------------------------------------------------------------- /tests/http.py: -------------------------------------------------------------------------------- 1 | import unittest, re 2 | 3 | 4 | class HTTPTests(unittest.TestCase): 5 | """ 6 | To test, navigate to pewtils root folder and run `python -m unittest tests` 7 | """ 8 | 9 | def setUp(self): 10 | pass 11 | 12 | def test_hash_url(self): 13 | from pewtils.http import hash_url 14 | 15 | url = hash_url("http://www.example.com") 16 | self.assertEqual(url, "7c1767b30512b6003fd3c2e618a86522") 17 | url = hash_url("www.example.com") 18 | self.assertEqual(url, "7c1767b30512b6003fd3c2e618a86522") 19 | 20 | def test_strip_html(self): 21 | # example.html taken from example.com on 3/5/19 22 | from contextlib import closing 23 | 24 | with closing(open("tests/files/example.html", "r")) as input: 25 | html = input.read() 26 | from pewtils.http import strip_html 27 | 28 | stripped_html = strip_html(html, simple=False) 29 | stripped_simple_html = strip_html(html, simple=True) 30 | # with closing(open("tests/files/example_stripped.html", "w")) as output: 31 | # output.write(stripped_html) 32 | # with closing(open("tests/files/example_stripped_simple.html", "w")) as output: 33 | # output.write(stripped_simple_html) 34 | 35 | with closing(open("tests/files/example_stripped.html", "r")) as input: 36 | text = input.read() 37 | self.assertEqual(text, stripped_html) 38 | with closing(open("tests/files/example_stripped_simple.html", "r")) as input: 39 | text = input.read() 40 | self.assertEqual(text, stripped_simple_html) 41 | 42 | def test_canonical_link(self): 43 | 44 | from pewtils.http import canonical_link 45 | 46 | user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 11.3; rv:88.0) Gecko/20100101 Firefox/88.0" 47 | 48 | for original_url, canonical_url in [ 49 | ( 50 | "https://nbcnews.to/2Yc5JVz", 51 | "https://www.nbcnews.com/politics/congress/senate-vote-9-11-first-responders-bill-tuesday-n1032831?cid=sm_npd_nn_tw_ma", 52 | ), 53 | ( 54 | "https://www.google.com/maps/d/viewer?mid=zQ8Zk-5ey-Y8.kgD9Rxu8JCNQ&hl=en&usp=sharing", 55 | "https://www.google.com/maps/d/viewer?mid=1NQVHeBBcVAnz9JwX1frZxX1ZgjY", 56 | ), 57 | ( 58 | "https://pewrsr.ch/2kk3VvY", 59 | "https://www.pewresearch.org/internet/2019/09/05/more-than-half-of-u-s-adults-trust-law-enforcement-to-use-facial-recognition-responsibly/", 60 | ), 61 | ( 62 | "https://pewrsr.ch/2ly4LFE", 63 | "https://www.pewresearch.org/internet/2019/09/05/the-challenges-of-using-machine-learning-to-identify-gender-in-images/", 64 | ), 65 | ( 66 | "https://pewrsr.ch/2lxB0EX", 67 | "https://www.pewresearch.org/interactives/how-does-a-computer-see-gender/", 68 | ), 69 | ]: 70 | result = canonical_link(original_url, user_agent=user_agent, timeout=60) 71 | self.assertEqual(result, canonical_url) 72 | 73 | def test_trim_get_parameters(self): 74 | from pewtils.http import trim_get_parameters 75 | 76 | user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 11.3; rv:88.0) Gecko/20100101 
Firefox/88.0" 77 | for original_url, trimmed_url in [ 78 | ("https://httpbin.org/status/200", "https://httpbin.org/status/200"), 79 | ( 80 | "https://httpbin.org/status/200?param=1", 81 | "https://httpbin.org/status/200", 82 | ), 83 | ]: 84 | trimmed = trim_get_parameters( 85 | original_url, user_agent=user_agent, timeout=30 86 | ) 87 | self.assertEqual(trimmed, trimmed_url) 88 | 89 | def test_link_shortener_map(self): 90 | 91 | import requests 92 | from six.moves.urllib import parse as urlparse 93 | from pewtils.http import ( 94 | GENERAL_LINK_SHORTENERS, 95 | VANITY_LINK_SHORTENERS, 96 | HISTORICAL_VANITY_LINK_SHORTENERS, 97 | trim_get_parameters, 98 | ) 99 | 100 | # These are domains that resolve properly but are alternatives to a preferred version 101 | IGNORE_DOMAINS = [ 102 | "ap.org", 103 | "cnet.co", 104 | "de.gov", 105 | "huffpost.com", 106 | "ky.gov", 107 | "mt.gov", 108 | "sen.gov", 109 | "twimg.com", 110 | ] 111 | 112 | user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 11.3; rv:88.0) Gecko/20100101 Firefox/88.0" 113 | self.session = requests.Session() 114 | self.session.headers.update({"User-Agent": user_agent}) 115 | for k, v in VANITY_LINK_SHORTENERS.items(): 116 | if ( 117 | k not in HISTORICAL_VANITY_LINK_SHORTENERS.keys() 118 | and k not in IGNORE_DOMAINS 119 | ): 120 | try: 121 | resp = self.session.head("http://{}".format(k), allow_redirects=True, timeout=10) 122 | 123 | except requests.exceptions.ConnectionError: 124 | print(f"Could not resolve short domain (may be historic): {k} (connection error)") 125 | resp = None 126 | 127 | if resp: 128 | resp_url = trim_get_parameters(resp.url, session=self.session, timeout=10).split("?")[0] 129 | 130 | if k in resp_url: 131 | print(f"Short domain resolved unexpectedly (may be historic): {k} (resolved to {resp_url} but expected {v})") 132 | 133 | else: 134 | resolved = re.match( 135 | "(www[0-9]?\.)?([^:]+)(:\d+$)?", 136 | urlparse.urlparse(resp.url).netloc, 137 | ).group(2).rstrip('/') 138 | resolved = VANITY_LINK_SHORTENERS.get(resolved, resolved) 139 | # Vanity domains are often purchased/managed through bit.ly or trib.al, and don't resolve 140 | # to their actual website unless paired with an actual page URL; so as long as they resolve 141 | # to what we expect, or a generic vanity URL like bit.ly, we'll assume everything's good 142 | self.assertTrue(resolved in GENERAL_LINK_SHORTENERS or v in resolved) 143 | 144 | self.session.close() 145 | 146 | def test_extract_domain_from_url(self): 147 | from pewtils.http import extract_domain_from_url 148 | 149 | for url, domain, include_subdomain, resolve in [ 150 | ("https://pewrsr.ch/2lxB0EX", "pewresearch.org", False, False), 151 | ("https://pewrsr.ch/2lxB0EX", "pewresearch.org", False, True), 152 | ("https://nbcnews.to/2Yc5JVz", "nbcnews.com", False, False), 153 | ("https://nbcnews.to/2Yc5JVz", "nbcnews.com", False, True), 154 | ("https://news.ycombinator.com", "ycombinator.com", False, False), 155 | ("https://news.ycombinator.com", "news.ycombinator.com", True, False), 156 | ("http://forums.bbc.co.uk", "forums.bbc.co.uk", True, False), 157 | ("http://forums.bbc.co.uk", "bbc.co.uk", False, False), 158 | ("http://www.worldbank.org.kg/", "worldbank.org.kg", True, False), 159 | ("http://forums.news.cnn.com/", "forums.news.cnn.com", True, False), 160 | ("http://forums.news.cnn.com/", "cnn.com", False, False), 161 | ]: 162 | extracted_domain = extract_domain_from_url( 163 | url, include_subdomain=include_subdomain, resolve_url=resolve 164 | ) 165 | self.assertEqual(extracted_domain, 
domain) 166 | 167 | def tearDown(self): 168 | if getattr(self, 'session', None) is not None: 169 | self.session.close() 170 | -------------------------------------------------------------------------------- /tests/io.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | from contextlib import closing 4 | 5 | 6 | class IOTests(unittest.TestCase): 7 | """ 8 | To test, navigate to pewtils root folder and run `python -m unittest tests` 9 | """ 10 | 11 | def setUp(self): 12 | import pandas as pd 13 | 14 | self.test_df = pd.DataFrame( 15 | [{"test": 1}, {"test": 2}, {"test": 3}, {"test": 4}] 16 | ) 17 | self.test_json = {"test1": 1, "test2": 2, "test3": 3, "test4": 4} 18 | import json 19 | 20 | test_json = json.dumps(self.test_json) 21 | self.test_json = json.loads(test_json) 22 | 23 | def test_filehandler_iterate_path(self): 24 | from pewtils.io import FileHandler 25 | 26 | h = FileHandler("tests/files", use_s3=False) 27 | files = [] 28 | for file in h.iterate_path(): 29 | files.append(file) 30 | files = [ 31 | f 32 | for f in files 33 | if not f.endswith(".pyc") and f not in ["__pycache__", ".DS_Store"] 34 | ] 35 | self.assertEqual( 36 | sorted(files), 37 | sorted( 38 | [ 39 | "subfolder", 40 | "__init__.py", 41 | "example.html", 42 | "example_stripped_simple.html", 43 | "json.json", 44 | "example_stripped.html", 45 | "py.py", 46 | ] 47 | ), 48 | ) 49 | 50 | def test_filehandler_clear_folder(self): 51 | from pewtils.io import FileHandler 52 | 53 | h = FileHandler("tests/files/temp", use_s3=False) 54 | 55 | with closing(open("tests/files/temp/temp.txt", "wb")) as output: 56 | output.write(b"test") 57 | h.clear_folder() 58 | files = [] 59 | for file in h.iterate_path(): 60 | files.append(file) 61 | self.assertEqual(len(files), 0) 62 | os.rmdir("tests/files/temp") 63 | 64 | def test_clear_file(self): 65 | from pewtils.io import FileHandler 66 | 67 | h = FileHandler("tests/files/temp", use_s3=False) 68 | with closing(open("tests/files/temp/temp.txt", "wb")) as output: 69 | output.write(b"test") 70 | h.clear_file("temp", format="txt") 71 | files = [] 72 | for file in h.iterate_path(): 73 | files.append(file) 74 | self.assertNotIn("temp.txt", files) 75 | self.assertEqual(len(files), 0) 76 | os.rmdir("tests/files/temp") 77 | 78 | h = FileHandler("tests/files/temp", use_s3=False) 79 | key = h.get_key_hash("temp") 80 | with closing(open("tests/files/temp/{}.txt".format(key), "wb")) as output: 81 | output.write(b"test") 82 | h.clear_file("temp", format="txt", hash_key=True) 83 | files = [] 84 | for file in h.iterate_path(): 85 | files.append(file) 86 | self.assertNotIn("{}.txt".format(key), files) 87 | self.assertEqual(len(files), 0) 88 | os.rmdir("tests/files/temp") 89 | 90 | def test_filehandler_get_key_hash(self): 91 | from pewtils.io import FileHandler 92 | 93 | h = FileHandler("tests/files", use_s3=False) 94 | self.assertEqual( 95 | h.get_key_hash("temp"), 96 | "c51bf90ccb22befa316b7a561fe9d5fd9650180b14421fc6d71bcd57", 97 | ) 98 | self.assertEqual( 99 | h.get_key_hash({"key": "value"}), 100 | "37e13e1116c86a6e9f3f8926375c7cb977ca74d2d598572ced03cd09", 101 | ) 102 | 103 | def test_filehandler_get_key_hash_s3(self): 104 | from pewtils.io import FileHandler 105 | 106 | if os.environ.get("S3_BUCKET"): 107 | h = FileHandler("tests/files", use_s3=True) 108 | self.assertEqual( 109 | h.get_key_hash("temp"), 110 | "c51bf90ccb22befa316b7a561fe9d5fd9650180b14421fc6d71bcd57", 111 | ) 112 | self.assertEqual( 113 | h.get_key_hash({"key": 
"value"}), 114 | "37e13e1116c86a6e9f3f8926375c7cb977ca74d2d598572ced03cd09", 115 | ) 116 | 117 | def test_filehandler_read_write_pkl(self): 118 | from pewtils.io import FileHandler 119 | 120 | h = FileHandler("tests/files", use_s3=False) 121 | h.write("temp", self.test_df, format="pkl") 122 | read = h.read("temp", format="pkl") 123 | import os 124 | 125 | os.unlink("tests/files/temp.pkl") 126 | self.assertEqual(repr(self.test_df), repr(read)) 127 | 128 | def test_filehandler_read_write_pkl_s3(self): 129 | from pewtils.io import FileHandler 130 | 131 | if os.environ.get("S3_BUCKET"): 132 | h = FileHandler("tests/files", use_s3=True) 133 | h.write("temp", self.test_df, format="pkl") 134 | read = h.read("temp", format="pkl") 135 | self.assertEqual(repr(self.test_df), repr(read)) 136 | 137 | def test_filehandler_read_write_csv(self): 138 | from pewtils.io import FileHandler 139 | 140 | h = FileHandler("tests/files", use_s3=False) 141 | h.write("temp", self.test_df, format="csv") 142 | read = h.read("temp", format="csv") 143 | del read["Unnamed: 0"] 144 | import os 145 | 146 | os.unlink("tests/files/temp.csv") 147 | self.assertEqual(repr(self.test_df), repr(read)) 148 | 149 | def test_filehandler_read_write_csv_s3(self): 150 | from pewtils.io import FileHandler 151 | 152 | if os.environ.get("S3_BUCKET"): 153 | h = FileHandler("tests/files", use_s3=True) 154 | h.write("temp", self.test_df, format="csv") 155 | read = h.read("temp", format="csv") 156 | del read["Unnamed: 0"] 157 | self.assertEqual(repr(self.test_df), repr(read)) 158 | 159 | def test_filehandler_read_write_txt(self): 160 | from pewtils.io import FileHandler 161 | 162 | h = FileHandler("tests/files", use_s3=False) 163 | h.write("temp", "test", format="txt") 164 | read = h.read("temp", format="txt") 165 | import os 166 | 167 | os.unlink("tests/files/temp.txt") 168 | self.assertEqual(read, "test") 169 | 170 | def test_filehandler_read_write_txt_s3(self): 171 | from pewtils.io import FileHandler 172 | 173 | if os.environ.get("S3_BUCKET"): 174 | h = FileHandler("tests/files", use_s3=True) 175 | h.write("temp", "test", format="txt") 176 | read = h.read("temp", format="txt") 177 | self.assertEqual(read, "test") 178 | 179 | def test_filehandler_read_write_tab(self): 180 | from pewtils.io import FileHandler 181 | 182 | h = FileHandler("tests/files", use_s3=False) 183 | h.write("temp", self.test_df, format="tab") 184 | read = h.read("temp", format="tab") 185 | del read["Unnamed: 0"] 186 | import os 187 | 188 | os.unlink("tests/files/temp.tab") 189 | self.assertEqual(repr(self.test_df), repr(read)) 190 | 191 | def test_filehandler_read_write_tab_s3(self): 192 | from pewtils.io import FileHandler 193 | 194 | if os.environ.get("S3_BUCKET"): 195 | h = FileHandler("tests/files", use_s3=True) 196 | h.write("temp", self.test_df, format="tab") 197 | read = h.read("temp", format="tab") 198 | del read["Unnamed: 0"] 199 | self.assertEqual(repr(self.test_df), repr(read)) 200 | 201 | def test_filehandler_read_write_xlsx(self): 202 | from pewtils.io import FileHandler 203 | 204 | h = FileHandler("tests/files", use_s3=False) 205 | h.write("temp", self.test_df, format="xlsx") 206 | read = h.read("temp", format="xlsx") 207 | if "Unnamed: 0" in read.columns: 208 | del read["Unnamed: 0"] 209 | import os 210 | 211 | os.unlink("tests/files/temp.xlsx") 212 | self.assertEqual(repr(self.test_df), repr(read)) 213 | 214 | def test_filehandler_read_write_xlsx_s3(self): 215 | from pewtils.io import FileHandler 216 | 217 | if os.environ.get("S3_BUCKET"): 218 | h = 
FileHandler("tests/files", use_s3=True) 219 | h.write("temp", self.test_df, format="xlsx") 220 | read = h.read("temp", format="xlsx") 221 | if "Unnamed: 0" in read.columns: 222 | del read["Unnamed: 0"] 223 | self.assertEqual(repr(self.test_df), repr(read)) 224 | 225 | def test_filehandler_read_write_xls(self): 226 | from pewtils.io import FileHandler 227 | 228 | h = FileHandler("tests/files", use_s3=False) 229 | h.write("temp", self.test_df, format="xls") 230 | read = h.read("temp", format="xls") 231 | if "Unnamed: 0" in read.columns: 232 | del read["Unnamed: 0"] 233 | import os 234 | 235 | os.unlink("tests/files/temp.xls") 236 | self.assertEqual(repr(self.test_df), repr(read)) 237 | 238 | def test_filehandler_read_write_xl_s3(self): 239 | from pewtils.io import FileHandler 240 | 241 | if os.environ.get("S3_BUCKET"): 242 | h = FileHandler("tests/files", use_s3=True) 243 | h.write("temp", self.test_df, format="xls") 244 | read = h.read("temp", format="xls") 245 | if "Unnamed: 0" in read.columns: 246 | del read["Unnamed: 0"] 247 | self.assertEqual(repr(self.test_df), repr(read)) 248 | 249 | def test_filehandler_read_write_dta(self): 250 | from pewtils.io import FileHandler 251 | 252 | h = FileHandler("tests/files", use_s3=False) 253 | h.write("temp", self.test_df, format="dta") 254 | read = h.read("temp", format="dta") 255 | del read["index"] 256 | import os 257 | 258 | os.unlink("tests/files/temp.dta") 259 | self.assertEqual(repr(self.test_df), repr(read)) 260 | 261 | def test_filehandler_read_write_dta_s3(self): 262 | from pewtils.io import FileHandler 263 | 264 | if os.environ.get("S3_BUCKET"): 265 | h = FileHandler("tests/files", use_s3=True) 266 | h.write("temp", self.test_df, format="dta") 267 | read = h.read("temp", format="dta") 268 | del read["index"] 269 | self.assertEqual(repr(self.test_df), repr(read)) 270 | 271 | def test_filehandler_read_write_json(self): 272 | from pewtils.io import FileHandler 273 | 274 | h = FileHandler("tests/files", use_s3=False) 275 | h.write("temp", self.test_json, format="json") 276 | read = h.read("temp", format="json") 277 | import os 278 | 279 | os.unlink("tests/files/temp.json") 280 | self.assertEqual(repr(self.test_json), repr(dict(read))) 281 | 282 | def test_filehandler_read_write_json_s3(self): 283 | from pewtils.io import FileHandler 284 | 285 | if os.environ.get("S3_BUCKET"): 286 | h = FileHandler("tests/files", use_s3=True) 287 | h.write("temp", self.test_json, format="json") 288 | read = h.read("temp", format="json") 289 | self.assertEqual(repr(self.test_json), repr(dict(read))) 290 | 291 | def tearDown(self): 292 | 293 | import os 294 | 295 | try: 296 | os.unlink("tests/files/temp/temp.txt") 297 | except OSError: 298 | pass 299 | for format in ["pkl", "csv", "tab", "txt", "xlsx", "xls", "dta", "json"]: 300 | try: 301 | os.unlink("tests/files/temp.{}".format(format)) 302 | except OSError: 303 | pass 304 | try: 305 | os.rmdir("tests/files/temp") 306 | except OSError: 307 | pass 308 | 309 | from pewtils.io import FileHandler 310 | 311 | if os.environ.get("S3_BUCKET"): 312 | h = FileHandler("tests/files", use_s3=True) 313 | for file in h.iterate_path(): 314 | if "." 
in file: 315 | filename, format = file.split(".") 316 | h.clear_file(filename, format=format) 317 | -------------------------------------------------------------------------------- /tests/base.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | 4 | class BaseTests(unittest.TestCase): 5 | 6 | """ 7 | To test, navigate to pewtils root folder and run `python -m unittest tests`. 8 | To assess unit test coverage, run `coverage run -m unittest tests` and then `coverage report -m`. 9 | """ 10 | 11 | def setUp(self): 12 | pass 13 | 14 | def test_decode_text(self): 15 | class FakeObject(object): 16 | def __str__(self): 17 | return "str" 18 | 19 | def __repr__(self): 20 | return "repr" 21 | 22 | import datetime 23 | import numpy as np 24 | from pewtils import decode_text 25 | 26 | text = decode_text("one two three") 27 | self.assertEqual(text, "one two three") 28 | # below examples taken from unidecode documentation 29 | text = decode_text(u"ko\u017eu\u0161\u010dek") 30 | self.assertEqual(text, "kozuscek") 31 | text = decode_text(u"30 \U0001d5c4\U0001d5c6/\U0001d5c1") 32 | self.assertIn(text, ["30 km/h", "30 /"]) 33 | # Python 2.7 does not have support for UTF-16 so it will fail on the above 34 | text = decode_text(u"\u5317\u4EB0") 35 | self.assertEqual(text, "Bei Jing ") 36 | text = decode_text(datetime.date(2019, 1, 1)) 37 | self.assertEqual(text, "2019-01-01") 38 | text = decode_text(None) 39 | self.assertEqual(text, "") 40 | text = decode_text("") 41 | self.assertEqual(text, "") 42 | text = decode_text(np.nan) 43 | self.assertEqual(text, "") 44 | text = decode_text(FakeObject()) 45 | self.assertEqual(text, "str") 46 | 47 | def test_is_null(self): 48 | 49 | import numpy as np 50 | import pandas as pd 51 | from pewtils import is_null, is_not_null 52 | 53 | for val in [None, "None", "nan", "", " ", "NaN", "none", "n/a", "NONE", "N/A"]: 54 | self.assertTrue(is_null(val)) 55 | self.assertTrue(is_null(np.nan)) 56 | self.assertTrue(is_not_null(0.0)) 57 | self.assertTrue(is_null("-9", custom_nulls=["-9"])) 58 | self.assertTrue(is_null([], empty_lists_are_null=True)) 59 | self.assertFalse(is_null([], empty_lists_are_null=False)) 60 | self.assertTrue(is_null(pd.Series(dtype=np.float64), empty_lists_are_null=True)) 61 | self.assertFalse(is_null(pd.Series(dtype=np.float64), empty_lists_are_null=False)) 62 | self.assertTrue(is_null(pd.DataFrame(), empty_lists_are_null=True)) 63 | self.assertFalse(is_null(pd.DataFrame(), empty_lists_are_null=False)) 64 | 65 | def test_recursive_update(self): 66 | from pewtils import recursive_update 67 | 68 | class TestObject(object): 69 | def __init__(self, val): 70 | self.val = val 71 | self.val_dict = {"key": "value"} 72 | 73 | test_obj = TestObject("1") 74 | base = { 75 | "level1": {"level2": {"val2": "test2"}, "val1": "test1", "val2": test_obj} 76 | } 77 | update = { 78 | "level1": { 79 | "level2": {"val2": "test123456"}, 80 | "val1": "test123", 81 | "val2": {"val": "2", "val_dict": {"key": "new_value"}}, 82 | "val3": {"test": "test"}, 83 | } 84 | } 85 | result = recursive_update(base, update) 86 | self.assertEqual(result["level1"]["level2"]["val2"], "test123456") 87 | self.assertEqual(result["level1"]["val1"], "test123") 88 | self.assertEqual(result["level1"]["val2"].val, "2") 89 | self.assertEqual(result["level1"]["val2"].val_dict["key"], "new_value") 90 | self.assertEqual(result["level1"]["val3"]["test"], "test") 91 | 92 | def test_chunk_list(self): 93 | from pewtils import chunk_list 94 | 95 | test = [1, 2, 
3, 4, 5, 6, 7, 8, 9, 10] 96 | chunked = [c for c in chunk_list(test, 3)] 97 | self.assertEqual(len(chunked), 4) 98 | self.assertEqual(chunked[-1], [10]) 99 | 100 | def test_extract_json_from_folder(self): 101 | from pewtils import extract_json_from_folder 102 | 103 | results = extract_json_from_folder( 104 | "tests/files", include_subdirs=False, concat_subdir_names=False 105 | ) 106 | self.assertEqual(results, {"json": {u"test_val": 1}}) 107 | results = extract_json_from_folder( 108 | "tests/files", include_subdirs=True, concat_subdir_names=False 109 | ) 110 | self.assertEqual( 111 | results, 112 | {"json": {u"test_val": 1}, "subfolder": {"subfolder": {u"test_val": 2}}}, 113 | ) 114 | results = extract_json_from_folder( 115 | "tests/files", include_subdirs=True, concat_subdir_names=True 116 | ) 117 | self.assertEqual( 118 | results, {"json": {u"test_val": 1}, "subfolder_subfolder": {u"test_val": 2}} 119 | ) 120 | 121 | def test_extract_attributes_from_folder_modules(self): 122 | from pewtils import extract_attributes_from_folder_modules 123 | 124 | results = extract_attributes_from_folder_modules("tests/files", "test") 125 | self.assertEqual(results["py"](), "test1") 126 | results = extract_attributes_from_folder_modules( 127 | "tests/files", "test", include_subdirs=True 128 | ) 129 | self.assertEqual(results["py"](), "test1") 130 | self.assertEqual(results["subfolder"]["subfolder_py"](), "test2") 131 | results = extract_attributes_from_folder_modules( 132 | "tests/files", "test", include_subdirs=True, concat_subdir_names=True 133 | ) 134 | self.assertEqual(results["py"](), "test1") 135 | self.assertEqual(results["subfolder_subfolder_py"](), "test2") 136 | 137 | def test_zipcode_num_to_string(self): 138 | 139 | from pewtils import zipcode_num_to_string 140 | 141 | for val in [20002, 20002.0, "20002", "20002.0"]: 142 | zip = zipcode_num_to_string(val) 143 | self.assertEqual(zip, "20002") 144 | for val in ["abcde", "12", "99999", "200", "1.0", None]: 145 | zip = zipcode_num_to_string(val) 146 | self.assertIsNone(zip) 147 | 148 | def test_flatten_list(self): 149 | from pewtils import flatten_list 150 | 151 | results = flatten_list([[1, 2, 3], [4, 5, 6]]) 152 | self.assertEqual(results, [1, 2, 3, 4, 5, 6]) 153 | 154 | def test_get_hash(self): 155 | from pewtils import get_hash 156 | 157 | for text, method, expected_value in [ 158 | ( 159 | "test_string", 160 | "nilsimsa", 161 | "49c808104092202004009004800200084a0240a0c09040a1113a04a821210016", 162 | ), 163 | ("test_string", "md5", "3474851a3410906697ec77337df7aae4"), 164 | ("test_string", "ssdeep", "3:HI2:Hl"), 165 | ( 166 | u"\u5317\u4EB0", 167 | "nilsimsa", 168 | "0100000044110004290804002820001002844001200601000101002800394081", 169 | ), 170 | (u"\u5317\u4EB0", "md5", "3261ad50fccf7ced43d944bbfd2acb5c"), 171 | (u"\u5317\u4EB0", "ssdeep", "3:I2n:l"), 172 | ]: 173 | hash = get_hash(text, hash_function=method) 174 | self.assertEqual(hash, expected_value) 175 | 176 | def test_concat_text(self): 177 | from pewtils import concat_text 178 | 179 | result = concat_text( 180 | "one two three", u"ko\u017eu\u0161\u010dek", u"\u5317\u4EB0", None 181 | ) 182 | self.assertEqual(result, "one two three kozuscek Bei Jing ") 183 | 184 | def test_vector_concat_text(self): 185 | from pewtils import vector_concat_text 186 | 187 | result = vector_concat_text(["one", "two", "three"], ["a", "b", "c"]) 188 | self.assertEqual(result[0], "one a") 189 | self.assertEqual(result[1], "two b") 190 | self.assertEqual(result[2], "three c") 191 | 192 | def 
test_cached_series_mapper(self): 193 | import pandas as pd 194 | from pewtils import cached_series_mapper 195 | 196 | df = pd.DataFrame([{"test": 1}, {"test": 2}, {"test": 3}, {"test": 3}]) 197 | df["mapped"] = cached_series_mapper(df["test"], lambda x: str(float(x))) 198 | self.assertEqual(list(df["mapped"].values), ["1.0", "2.0", "3.0", "3.0"]) 199 | 200 | def test_multiprocess_group_apply(self): 201 | 202 | import pandas as pd 203 | from pewtils import multiprocess_group_apply 204 | 205 | df = pd.DataFrame([{"test": 1}, {"test": 2}, {"test": 3}, {"test": 3}]) 206 | df["group"] = [1, 1, 2, 2] 207 | 208 | for add, multiply, expected in [(1, 2, 6), (1, 3, 9), (2, 2, 8)]: 209 | result = multiprocess_group_apply( 210 | df.groupby("group"), _test_function_agg, add, multiply=multiply 211 | ) 212 | self.assertEqual(len(result), 2) 213 | self.assertEqual((result == expected).astype(int).sum(), 2) 214 | 215 | for add, multiply, expected in [ 216 | (1, 2, [4, 6, 8, 8]), 217 | (1, 3, [6, 9, 12, 12]), 218 | (2, 2, [6, 8, 10, 10]), 219 | ]: 220 | 221 | result = multiprocess_group_apply( 222 | df.groupby("group"), _test_function_map, add, multiply=multiply 223 | ) 224 | self.assertEqual(len(result), 4) 225 | self.assertEqual(list(result.values), expected) 226 | 227 | def test_scale_range(self): 228 | from pewtils import scale_range 229 | 230 | self.assertEqual(scale_range(10, 5, 25, 0, 10), 2.5) 231 | self.assertEqual(scale_range(5, 0, 10, 0, 20), 10.0) 232 | 233 | def test_scan_dictionary(self): 234 | from pewtils import scan_dictionary 235 | 236 | test_dict = {"one": {"two": {"three": "woot"}}} 237 | vals, paths = scan_dictionary(test_dict, "three") 238 | self.assertEqual(vals[0], "woot") 239 | self.assertEqual(paths[0], "one/two/three/") 240 | vals, paths = scan_dictionary(test_dict, "doesnt_exist") 241 | self.assertEqual(vals, []) 242 | self.assertEqual(paths, []) 243 | 244 | test_dict = { 245 | "one": { 246 | "two": {"three": "woot"}, 247 | "three": {"four": "five"}, 248 | "six": [{"three": "seven"}], 249 | } 250 | } 251 | vals, paths = scan_dictionary(test_dict, "three") 252 | self.assertEqual(len(vals), 3) 253 | self.assertEqual(len(paths), 3) 254 | self.assertIn("woot", vals) 255 | self.assertIn({"four": "five"}, vals) 256 | self.assertIn("seven", vals) 257 | self.assertIn("one/two/three/", paths) 258 | self.assertIn("one/three/", paths) 259 | self.assertIn("one/six/three/", paths) 260 | 261 | def test_new_random_number(self): 262 | from pewtils import new_random_number 263 | import numpy as np 264 | 265 | for attempt, minimum, maximum, avg in [ 266 | (1, 1, 2, 1), 267 | (1, 1, 10, 1), 268 | (2, 1, 10, 2), 269 | (3, 1, 10, 4), 270 | (4, 1, 10, 5), 271 | (5, 1, 10, 5), 272 | (1, 2, 2, 2), 273 | (1, 2, 10, 3), 274 | (2, 2, 10, 4), 275 | (3, 2, 10, 5), 276 | (4, 2, 10, 5), 277 | (5, 2, 10, 5), 278 | ]: 279 | attempts = [ 280 | new_random_number(attempt=attempt, minimum=minimum, maximum=maximum) 281 | for i in range(500) 282 | ] 283 | self.assertGreaterEqual(np.min(attempts), minimum) 284 | self.assertLessEqual(np.max(attempts), maximum) 285 | self.assertGreaterEqual(round(np.average(attempts)), avg) 286 | 287 | def test_timeout_wrapper(self): 288 | from pewtils import timeout_wrapper 289 | import time 290 | 291 | def test(sleep): 292 | try: 293 | with timeout_wrapper(2): 294 | time.sleep(sleep) 295 | return True 296 | except: 297 | return False 298 | 299 | self.assertFalse(test(3)) 300 | self.assertTrue(test(1)) 301 | 302 | def test_print_execution_time(self): 303 | 304 | import
time 306 | from io import StringIO 307 | from pewtils import PrintExecutionTime 308 | 309 | temp = StringIO() 310 | with PrintExecutionTime(label="my function", stdout=temp): 311 | time.sleep(5) 312 | temp.seek(0) 313 | output = temp.getvalue() 314 | self.assertIsNotNone(re.match(r"my function: 5\.[0-9]+ seconds", output)) 315 | 316 | def tearDown(self): 317 | pass 318 | 319 | 320 | def _test_function_agg(grp, add, multiply=1): 321 | return (len(grp) + add) * multiply 322 | 323 | 324 | def _test_function_map(grp, add, multiply=1): 325 | return grp["test"].map(lambda x: (x + add) * multiply) 326 | -------------------------------------------------------------------------------- /pewtils/io.py: -------------------------------------------------------------------------------- 1 | from builtins import object 2 | from contextlib import closing 3 | from pewtils import is_not_null 4 | from scandir import scandir 5 | import boto3 6 | import datetime 7 | import hashlib 8 | import json 9 | import os 10 | import pandas as pd 11 | import pickle as pickle 12 | import time 13 | 14 | try: 15 | from io import StringIO, BytesIO 16 | 17 | except ImportError: 18 | from StringIO import StringIO as BytesIO 19 | from StringIO import StringIO 20 | 21 | 22 | class FileHandler(object): 23 | 24 | """ 25 | Read/write data files in a variety of formats, locally and in Amazon S3 buckets. 26 | 27 | :param path: A valid path to the folder in local or s3 directory where files will be written to or read from 28 | :type path: str 29 | :param use_s3: Whether the path is an S3 location or local location 30 | :type use_s3: bool 31 | :param bucket: The name of the S3 bucket, required if ``use_s3=True``; will also try to fetch from the environment \ 32 | as S3_BUCKET 33 | :type bucket: str 34 | 35 | .. note:: Typical rectangular data files (i.e. ``csv``, ``tab``, ``xlsx``, ``xls``, ``dta`` file extension types) will be \ 36 | read to/written from a :py:class:`pandas.DataFrame` object. The exceptions are `pkl` and `json` objects which \ 37 | accept any serializable Python object and correctly-formatted JSON object respectively. 38 | 39 | .. tip:: You can configure your environment to make it easier to automatically connect to S3 by defining the \ 40 | variable ``S3_BUCKET``. 41 | 42 | Usage:: 43 | 44 | from pewtils.io import FileHandler 45 | 46 | >>> h = FileHandler("./", use_s3=False) # current local folder 47 | >>> df = h.read("my_csv", format="csv") 48 | # Do something and save to Excel 49 | >>> h.write("my_new_csv", df, format="xlsx") 50 | 51 | >>> my_data = [{"key": "value"}] 52 | >>> h.write("my_data", my_data, format="json") 53 | 54 | >>> my_data = ["a", "python", "list"] 55 | >>> h.write("my_data", my_data, format="pkl") 56 | 57 | # To read/write to an S3 bucket 58 | # The FileHandler detects your AWS tokens using boto3's standard methods to find them in ~/.aws or defined as environment variables. 
59 | >>> h = FileHandler("/my_folder", use_s3=True, bucket="my-bucket") 60 | """ 61 | 62 | def __init__(self, path, use_s3=None, bucket=None): 63 | self.bucket = os.environ.get("S3_BUCKET", None) if bucket is None else bucket 64 | self.path = path 65 | self.use_s3 = use_s3 if is_not_null(self.bucket) else False 66 | if self.use_s3: 67 | s3_params = {} 68 | self.s3 = boto3.client("s3") 69 | 70 | else: 71 | self.path = os.path.join(self.path) 72 | if not os.path.exists(self.path): 73 | try: 74 | os.makedirs(self.path) 75 | 76 | except Exception as e: 77 | print("Warning: couldn't make directory '{}'".format(self.path)) 78 | print(e) 79 | 80 | def iterate_path(self): 81 | 82 | """ 83 | Iterates over the directory and returns a list of filenames or S3 object keys 84 | 85 | :return: Yields a list of filenames or S3 keys 86 | :rtype: iterable 87 | 88 | Usage:: 89 | 90 | from pewtils.io import FileHandler 91 | 92 | >>> h = FileHandler("./", use_s3=False) 93 | >>> for file in h.iterate_path(): print(file) 94 | file1.csv 95 | file2.pkl 96 | file3.json 97 | 98 | """ 99 | 100 | if self.use_s3: 101 | for key in self.s3.list_objects(Bucket=self.bucket, Prefix=self.path).get("Contents", []): 102 | yield key["Key"] 103 | 104 | else: 105 | for f in scandir(self.path): 106 | yield f.name 107 | 108 | def clear_folder(self): 109 | """ 110 | Deletes the path (if local) or unlinks all keys in the bucket folder (if S3) 111 | 112 | .. warning:: This is a destructive function, use with caution! 113 | 114 | Usage:: 115 | 116 | from pewtils.io import FileHandler 117 | 118 | >>> h = FileHandler("./", use_s3=False) 119 | >>> len(list(h.iterate_path())) 120 | 3 121 | >>> h.clear_folder() 122 | >>> len(list(h.iterate_path())) 123 | 0 124 | 125 | """ 126 | 127 | if self.use_s3: 128 | for key in self.s3.list_objects(Bucket=self.bucket, Prefix=self.path).get("Contents", []): 129 | self.s3.delete_object(Bucket=self.bucket, Key=key["Key"]) 130 | 131 | else: 132 | for f in scandir(self.path): 133 | os.unlink(os.path.join(self.path, f.name)) 134 | 135 | def clear_file(self, key, format="pkl", hash_key=False): 136 | """ 137 | Deletes a specific file. 138 | 139 | .. warning:: This is a destructive function, use with caution! 140 | 141 | :param key: The name of the file to delete 142 | :type key: str 143 | :param format: The file extension 144 | :type format: str 145 | :param hash_key: If True, will hash the filename before looking it up; default is False. 146 | :type hash_key: bool 147 | 148 | Usage:: 149 | 150 | from pewtils.io import FileHandler 151 | 152 | >>> h = FileHandler("./", use_s3=False) 153 | >>> for file in h.iterate_path(): print(file) 154 | file1.csv 155 | file2.pkl 156 | file3.json 157 | >>> h.clear_file("file1", format="csv") 158 | >>> for file in h.iterate_path(): print(file) 159 | file2.pkl 160 | file3.json 161 | 162 | """ 163 | 164 | if hash_key: 165 | key = self.get_key_hash(key) 166 | 167 | if self.use_s3: 168 | filepath = "/".join([self.path, "{}.{}".format(key, format)]) 169 | self.s3.delete_object(Bucket=self.bucket, Key=filepath) 170 | 171 | else: 172 | key += ".{}".format(format) 173 | path = os.path.join(self.path, key) 174 | os.unlink(path) 175 | 176 | def get_key_hash(self, key): 177 | 178 | """ 179 | Converts a key to a hashed representation. Allows you to pass arbitrary objects and convert their string \ 180 | representation into a shorter hashed key, so it can be useful for caching.
You can call this method \ 181 | directly to see the hash that a key will be converted into, but this method is mainly used in conjunction \ 182 | with the :py:meth:`pewtils.io.FileHandler.write` and :py:meth:`pewtils.io.FileHandler.read` methods by passing in \ 183 | ``hash_key=True``. 184 | 185 | :param key: A raw string or Python object that can be meaningfully converted into a string representation 186 | :type key: str or object 187 | :return: A SHA224 hash representation of that key 188 | :rtype: str 189 | 190 | Usage:: 191 | 192 | from pewtils.io import FileHandler 193 | 194 | >>> h = FileHandler("tests/files", use_s3=False) 195 | >>> h.get_key_hash("temp") 196 | "c51bf90ccb22befa316b7a561fe9d5fd9650180b14421fc6d71bcd57" 197 | >>> h.get_key_hash({"key": "value"}) 198 | "37e13e1116c86a6e9f3f8926375c7cb977ca74d2d598572ced03cd09" 199 | 200 | """ 201 | 202 | try: 203 | return hashlib.sha224(key.encode("utf8")).hexdigest() 204 | except AttributeError: 205 | return hashlib.sha224(str(key).encode("utf8")).hexdigest() 206 | 207 | def write( 208 | self, key, data, format="pkl", hash_key=False, add_timestamp=False, **io_kwargs 209 | ): 210 | 211 | """ 212 | Writes arbitrary data objects to a variety of file formats. 213 | 214 | 215 | :param key: The name of the file or key (without a file suffix!) 216 | :type key: str 217 | :param data: The actual data to write to the file 218 | :type data: object 219 | :param format: The format the data should be saved in (pkl/csv/tab/xlsx/xls/dta/json). Defaults to pkl. \ 220 | This will be used as the file's suffix. 221 | :type format: str 222 | :param hash_key: Whether or not to hash the provided key before saving the file. (Default=False) 223 | :type hash_key: bool 224 | :param add_timestamp: Optionally add a timestamp to the filename 225 | :type add_timestamp: bool 226 | :param io_kwargs: Additional parameters to pass along to the Pandas save function, if applicable 227 | :return: None 228 | 229 | .. note:: When saving a ``csv``, ``tab``, ``xlsx``, ``xls``, or ``dta`` file, this function expects to receive a \ 230 | Pandas :py:class:`pandas.DataFrame`. When you use these formats, you can also pass optional ``io_kwargs`` \ 231 | which will be forwarded to the corresponding :py:mod:`pandas` method below: 232 | 233 | - `dta`: :py:meth:`pandas.DataFrame.to_stata` 234 | - `csv`: :py:meth:`pandas.DataFrame.to_csv` 235 | - `tab`: :py:meth:`pandas.DataFrame.to_csv` 236 | - `xlsx`: :py:meth:`pandas.DataFrame.to_excel` 237 | - `xls`: :py:meth:`pandas.DataFrame.to_excel` 238 | 239 | If you're trying to save an object to JSON, it assumes that you're passing it a JSON-serializable object. By default, \ 240 | the handler attempts to use pickling, allowing you to save anything you want, as long as it's serializable.
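Usage (a minimal sketch; ``df`` is assumed to be a :py:class:`pandas.DataFrame` you already have in memory, and the extra ``index`` keyword is simply forwarded to :py:meth:`pandas.DataFrame.to_csv`)::

    from pewtils.io import FileHandler

    >>> h = FileHandler("./", use_s3=False)
    # io_kwargs like index=False pass straight through to the pandas writer
    >>> h.write("my_df", df, format="csv", index=False)
    # add_timestamp=True saves to a timestamped filename, e.g. "my_df_<timestamp>.csv"
    >>> h.write("my_df", df, format="csv", add_timestamp=True)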
241 | 242 | """ 243 | 244 | format = format.strip(".") 245 | 246 | if hash_key: 247 | key = self.get_key_hash(key) 248 | 249 | if add_timestamp: 250 | key = "{}_{}".format(key, datetime.datetime.now()) 251 | 252 | def _get_output(output, data, io_kwargs): 253 | if format == "tab": 254 | io_kwargs["sep"] = "\t" 255 | if format in ["csv", "tab"]: 256 | data.to_csv(output, encoding="utf8", **io_kwargs) 257 | elif format == "dta": 258 | data.to_stata(output, **io_kwargs) 259 | elif format in ["xls", "xlsx"]: 260 | writer = pd.ExcelWriter(output, engine="xlsxwriter") 261 | data.to_excel(writer, **io_kwargs) 262 | writer.save() 263 | data = output.getvalue() 264 | return data 265 | 266 | if format in ["csv", "xls", "xlsx", "tab", "dta"]: 267 | try: 268 | data = _get_output(BytesIO(), data, io_kwargs) 269 | except Exception as e: 270 | try: 271 | data = _get_output(StringIO(), data, io_kwargs) 272 | except: 273 | raise Exception( 274 | "Couldn't convert data into '{}' format".format(format) 275 | ) 276 | 277 | elif format == "pkl": 278 | data = pickle.dumps(data, **io_kwargs) 279 | elif format == "json": 280 | data = json.dumps(data, **io_kwargs) 281 | 282 | key += ".{}".format(format) 283 | 284 | if self.use_s3: 285 | try: 286 | upload = BytesIO(data) 287 | 288 | except TypeError: 289 | upload = BytesIO(data.encode()) 290 | 291 | self.s3.upload_fileobj(upload, Bucket=self.bucket, Key="/".join([self.path, key])) 292 | 293 | else: 294 | path = os.path.join(self.path, key) 295 | if os.path.exists(self.path): 296 | try: 297 | with closing(open(path, "w")) as output: 298 | output.write(data) 299 | except: 300 | with closing(open(path, "wb")) as output: 301 | output.write(data) 302 | 303 | def read(self, key, format="pkl", hash_key=False, **io_kwargs): 304 | 305 | """ 306 | Reads a file from the directory or S3 path, returning its contents. 307 | 308 | :param key: The name of the file to read (without a suffix!) 309 | :type key: str 310 | :param format: The format of the file (pkl/json/csv/dta/xls/xlsx/tab); expects the file extension to match 311 | :type format: str 312 | :param hash_key: Whether the key should be hashed prior to looking for and retrieving the file. 313 | :type hash_key: bool 314 | :param io_kwargs: Optional arguments to be passed to the specific load function (dependent on file format) 315 | :return: The file contents, in the requested format 316 | 317 | .. 
note:: You can pass optional ``io_kwargs`` that will be forwarded to the function below that corresponds to \ 318 | the format of the file you're trying to read in: 319 | 320 | - `dta`: :py:func:`pandas.read_stata` 321 | - `csv`: :py:func:`pandas.read_csv` 322 | - `tab`: :py:func:`pandas.read_csv` 323 | - `xlsx`: :py:func:`pandas.read_excel` 324 | - `xls`: :py:func:`pandas.read_excel` 325 | """ 326 | 327 | format = format.strip(".") 328 | 329 | if hash_key: 330 | key = self.get_key_hash(key) 331 | 332 | data = None 333 | filepath = "/".join([self.path, "{}.{}".format(key, format)]) 334 | 335 | if self.use_s3: 336 | # boto3 writes raw bytes into the buffer, so use BytesIO rather than StringIO 337 | data = BytesIO() 338 | 339 | # download_fileobj expects the bucket and key, plus a writable file object 340 | self.s3.download_fileobj( 341 | Fileobj=data, Bucket=self.bucket, Key=filepath 342 | ) 343 | data = data.getvalue() 344 | else: 345 | if os.path.exists(filepath): 346 | try: 347 | with closing(open(filepath, "r")) as infile: 348 | data = infile.read() 349 | 350 | except: 351 | # TODO: handle this exception more explicitly 352 | with closing(open(filepath, "rb")) as infile: 353 | data = infile.read() 354 | 355 | if is_not_null(data): 356 | if format == "pkl": 357 | try: 358 | data = pickle.loads(data) 359 | 360 | except TypeError: 361 | data = None 362 | 363 | except ValueError: 364 | if "attempt_count" not in io_kwargs: 365 | io_kwargs["attempt_count"] = 1 366 | 367 | print( 368 | "Insecure pickle string; probably a concurrent read-write, \ 369 | will try again in 5 seconds (attempt #{})".format( 370 | io_kwargs["attempt_count"] 371 | ) 372 | ) 373 | time.sleep(5) 374 | 375 | if io_kwargs["attempt_count"] <= 3: 376 | io_kwargs["attempt_count"] += 1 377 | data = self.read( 378 | key, format=format, hash_key=hash_key, **io_kwargs 379 | ) 380 | 381 | else: 382 | data = None 383 | 384 | except Exception as e: 385 | print("Couldn't load pickle! {}".format(e)) 386 | data = None 387 | 388 | elif format in ["tab", "csv"]: 389 | if format == "tab": 390 | io_kwargs["delimiter"] = "\t" 391 | 392 | try: 393 | data = pd.read_csv(BytesIO(data), **io_kwargs) 394 | 395 | except: 396 | data = pd.read_csv(StringIO(data), **io_kwargs) 397 | 398 | elif format in ["xlsx", "xls"]: 399 | # https://stackoverflow.com/questions/64264563/attributeerror-elementtree-object-has-no-attribute-getiterator-when-trying 400 | if "engine" not in io_kwargs: 401 | io_kwargs["engine"] = "openpyxl" 402 | 403 | try: 404 | data = pd.read_excel(BytesIO(data), **io_kwargs) 405 | 406 | except: 407 | data = pd.read_excel(StringIO(data), **io_kwargs) 408 | 409 | elif format == "json": 410 | try: 411 | data = json.loads(data) 412 | 413 | except: 414 | pass 415 | 416 | elif format == "dta": 417 | try: 418 | data = pd.read_stata(BytesIO(data), **io_kwargs) 419 | 420 | except: 421 | data = pd.read_stata(StringIO(data), **io_kwargs) 422 | 423 | elif format == "txt": 424 | if isinstance(data, bytes): 425 | data = data.decode() 426 | 427 | return data 428 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed.
8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 
71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 
128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. 
Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. 
If the Program
243 | specifies a version number of this License which applies to it and "any
244 | later version", you have the option of following the terms and conditions
245 | either of that version or of any later version published by the Free
246 | Software Foundation. If the Program does not specify a version number of
247 | this License, you may choose any version ever published by the Free Software
248 | Foundation.
249 | 
250 | 10. If you wish to incorporate parts of the Program into other free
251 | programs whose distribution conditions are different, write to the author
252 | to ask for permission. For software which is copyrighted by the Free
253 | Software Foundation, write to the Free Software Foundation; we sometimes
254 | make exceptions for this. Our decision will be guided by the two goals
255 | of preserving the free status of all derivatives of our free software and
256 | of promoting the sharing and reuse of software generally.
257 | 
258 | NO WARRANTY
259 | 
260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268 | REPAIR OR CORRECTION.
269 | 
270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278 | POSSIBILITY OF SUCH DAMAGES.
279 | 
280 | END OF TERMS AND CONDITIONS
281 | 
282 | How to Apply These Terms to Your New Programs
283 | 
284 | If you develop a new program, and you want it to be of the greatest
285 | possible use to the public, the best way to achieve this is to make it
286 | free software which everyone can redistribute and change under these terms.
287 | 
288 | To do so, attach the following notices to the program. It is safest
289 | to attach them to the start of each source file to most effectively
290 | convey the exclusion of warranty; and each file should have at least
291 | the "copyright" line and a pointer to where the full notice is found.
292 | 
293 | <one line to give the program's name and a brief idea of what it does.>
294 | Copyright (C) <year> <name of author>
295 | 
296 | This program is free software; you can redistribute it and/or modify
297 | it under the terms of the GNU General Public License as published by
298 | the Free Software Foundation; either version 2 of the License, or
299 | (at your option) any later version.
300 | 
301 | This program is distributed in the hope that it will be useful,
302 | but WITHOUT ANY WARRANTY; without even the implied warranty of
303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
304 | GNU General Public License for more details.
305 | 
306 | You should have received a copy of the GNU General Public License along
307 | with this program; if not, write to the Free Software Foundation, Inc.,
308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
309 | 
310 | Also add information on how to contact you by electronic and paper mail.
311 | 
312 | If the program is interactive, make it output a short notice like this
313 | when it starts in an interactive mode:
314 | 
315 | Gnomovision version 69, Copyright (C) year name of author
316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
317 | This is free software, and you are welcome to redistribute it
318 | under certain conditions; type `show c' for details.
319 | 
320 | The hypothetical commands `show w' and `show c' should show the appropriate
321 | parts of the General Public License. Of course, the commands you use may
322 | be called something other than `show w' and `show c'; they could even be
323 | mouse-clicks or menu items--whatever suits your program.
324 | 
325 | You should also get your employer (if you work as a programmer) or your
326 | school, if any, to sign a "copyright disclaimer" for the program, if
327 | necessary. Here is a sample; alter the names:
328 | 
329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program
330 | `Gnomovision' (which makes passes at compilers) written by James Hacker.
331 | 
332 | <signature of Ty Coon>, 1 April 1989
333 | Ty Coon, President of Vice
334 | 
335 | This General Public License does not permit incorporating your program into
336 | proprietary programs. If your program is a subroutine library, you may
337 | consider it more useful to permit linking proprietary applications with the
338 | library. If this is what you want to do, use the GNU Lesser General
339 | Public License instead of this License.
--------------------------------------------------------------------------------
/pewtils/http.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | from bs4 import BeautifulSoup
3 | from builtins import str
4 | from pewtils import get_hash, decode_text, is_not_null
5 | from six.moves.urllib import parse as urlparse
6 | from unidecode import unidecode
7 | import pandas as pd
8 | import re
9 | import os
10 | import requests
11 | import tldextract
12 | import warnings
13 | from requests.exceptions import ReadTimeout
14 | from stopit import ThreadingTimeout as Timeout
15 | 
16 | 
17 | _ = pd.read_csv(
18 |     os.path.join(
19 |         os.path.dirname(os.path.abspath(__file__)), "general_link_shorteners.csv"
20 |     )
21 | )
22 | GENERAL_LINK_SHORTENERS = _["shortener"].values
23 | 
24 | 
25 | _ = pd.read_csv(
26 |     os.path.join(
27 |         os.path.dirname(os.path.abspath(__file__)), "vanity_link_shorteners.csv"
28 |     )
29 | )
30 | _ = _[_["historical"] == 0]
31 | VANITY_LINK_SHORTENERS = dict(zip(_["shortener"], _["expanded"]))
32 | 
33 | _ = pd.read_csv(
34 |     os.path.join(
35 |         os.path.dirname(os.path.abspath(__file__)), "vanity_link_shorteners.csv"
36 |     )
37 | )
38 | _ = _[_["historical"] == 1]
39 | HISTORICAL_VANITY_LINK_SHORTENERS = dict(zip(_["shortener"], _["expanded"]))
40 | 
41 | VANITY_LINK_SHORTENERS.update(HISTORICAL_VANITY_LINK_SHORTENERS)
42 | 
43 | 
44 | def hash_url(url):
45 | 
46 |     """
47 |     Clears out the http/https prefix and returns an MD5 hash of the URL. More effective \
48 |     when used in conjunction with :py:func:`pewtils.http.canonical_link`.
49 | 
50 |     :param url: The URL to hash
51 |     :type url: str
52 |     :return: Hashed string representation of the URL using the md5 hashing algorithm.
53 |     :rtype: str
54 | 
55 |     Usage::
56 | 
57 |         from pewtils.http import hash_url
58 | 
59 |         >>> hash_url("http://www.example.com")
60 |         "7c1767b30512b6003fd3c2e618a86522"
61 |         >>> hash_url("www.example.com")
62 |         "7c1767b30512b6003fd3c2e618a86522"
63 | 
64 |     """
65 | 
66 |     http_regex = re.compile(r"^http(s)?\:\/\/")
67 |     with warnings.catch_warnings():
68 |         warnings.simplefilter("ignore")
69 |         result = get_hash(
70 |             unidecode(http_regex.sub("", url.lower())), hash_function="md5"
71 |         )
72 |     return result
73 | 
74 | 
75 | def strip_html(html, simple=False, break_tags=None):
76 | 
77 |     """
78 |     Attempts to strip out HTML code from an arbitrary string while preserving meaningful text components. \
79 |     By default, the function will use BeautifulSoup to parse the HTML. Setting ``simple=True`` will make the \
80 |     function use a much simpler regular expression approach to parsing.
81 | 
82 |     :param html: The HTML to process
83 |     :type html: str
84 |     :param simple: Whether or not to use a simple regex or more complex parsing rules (default=False)
85 |     :type simple: bool
86 |     :param break_tags: A custom list of tags on which to break (default is ["strong", "em", "i", "b", "p"])
87 |     :type break_tags: list
88 |     :return: The text with HTML components removed
89 |     :rtype: str
90 | 
91 |     .. note:: This function might not be effective for *all* variations of HTML structures, but it produces fairly \
92 |     reliable results in removing the vast majority of HTML without stripping out valuable content.
93 | 
94 |     Usage::
95 | 
96 |         from pewtils.http import strip_html
97 | 
98 |         >>> my_html = "<html><head>Header text</head><body>Body text</body></html>"
99 |         >>> strip_html(my_html)
100 |         'Header text Body text'
101 | 
102 |     """
103 | 
104 |     html = re.sub(r"\n", " ", html)
105 |     html = re.sub(r"\s+", " ", html)
106 |     if not break_tags:
107 |         break_tags = ["strong", "em", "i", "b", "p"]
108 |     if not simple:
109 |         try:
110 | 
111 |             split_re = re.compile(r"\s{2,}")
112 |             soup = BeautifulSoup(html, "lxml")
113 |             for tag in soup():
114 |                 if (
115 |                     "class" in tag.attrs
116 |                     and ("menu" in tag.attrs["class"] or "header" in tag.attrs["class"])
117 |                 ) or ("menu" in str(tag.id) or "header" in str(tag.id)):
118 |                     tag.extract()
119 |             for tag in soup(["script", "style"]):
120 |                 tag.extract()
121 |             for br in soup.find_all("br"):
122 |                 br.replace_with("\n")
123 |             for t in soup(break_tags):
124 |                 try:
125 |                     t.replace_with("\n{0}\n".format(t.text))
126 |                 except (UnicodeDecodeError, UnicodeEncodeError):
127 |                     t.replace_with("\n{0}\n".format(decode_text(t.text)))
128 |             if hasattr(soup, "body") and soup.body:
129 |                 text = soup.body.get_text()
130 |             else:
131 |                 text = soup.get_text()
132 |             lines = [l.strip() for l in text.splitlines()]
133 |             lines = [l2.strip() for l in lines for l2 in split_re.split(l)]
134 |             text = "\n".join([l for l in lines if l])
135 |             text = re.sub(r"(\sA){2,}\s", " ", text)
136 |             text = re.sub(r"\n+(\s+)?", "\n\n", text)
137 |             text = re.sub(r" +", " ", text)
138 |             text = re.sub(r"\t+", " ", text)
139 | 
140 |             return text
141 | 
142 |         except Exception as e:
143 | 
144 |             print("strip_html error")
145 |             print(e)
146 |             text = re.sub(r"<[^>]*>", " ", re.sub("\\s+", " ", html)).strip()
147 |             return text
148 | 
149 |     else:
150 |         return "\n".join(
151 |             [
152 |                 re.sub(r"\s+", " ", re.sub(r"\<[^\>]+\>", " ", section))
153 |                 for section in re.sub(r"\<\/?div\>|\<\/?p\>|\<br\>", "\n", html).split(
154 |                     "\n"
155 |                 )
156 |             ]
157 |         )
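# Editor's note: a quick illustrative sketch of strip_html (not part of the
# original module). With the default parser, break tags such as <p> become
# line breaks and <script>/<style> contents are dropped entirely:
#
#     >>> strip_html("<html><body><p>Hello</p><script>var x;</script></body></html>")
#     'Hello'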
158 | 159 | 160 | def trim_get_parameters(url, session=None, timeout=30, user_agent=None): 161 | 162 | """ 163 | Takes a URL (presumed to be the final end point) and iterates over GET parameters, attempting to find optional 164 | ones that can be removed without generating any redirects. 165 | 166 | :param url: The URL to trim 167 | :type url: str 168 | :param session: (Optional) A persistent session that can optionally be passed (useful if you're processing many \ 169 | links at once) 170 | :type session: :py:class:`requests.Session` object 171 | :param user_agent: User agent for the auto-created requests Session to use, if a preconfigured requests Session \ 172 | is not provided 173 | :type user_agent: str 174 | :param timeout: Timeout for requests 175 | :type timeout: int or float 176 | :return: The original URL with optional GET parameters removed 177 | :rtype: str 178 | 179 | Usage:: 180 | 181 | from pewtils.http import trim_get_parameters 182 | 183 | >>> trim_get_parameters("https://httpbin.org/status/200?param=1") 184 | "https://httpbin.org/status/200" 185 | 186 | """ 187 | 188 | close_session = False 189 | if not session: 190 | close_session = True 191 | session = requests.Session() 192 | session.headers.update({"User-Agent": user_agent}) 193 | 194 | # Often there's extra information about social sharing and referral sources that can be removed 195 | ditch_params = [] 196 | parsed = urlparse.urlparse(url) 197 | if parsed.query: 198 | params = urlparse.parse_qs(parsed.query) 199 | for k, v in params.items(): 200 | # We iterate over all of the GET parameters and try holding each one out 201 | check = True 202 | for skipper in ["document", "article", "id", "qs"]: 203 | # If the parameter is named something that's probably a unique ID, we'll keep it 204 | if skipper in k.lower(): 205 | check = False 206 | for skipper in ["html", "http"]: 207 | # Same goes for parameters that contain URL information 208 | if skipper in v[0].lower(): 209 | check = False 210 | if check: 211 | new_params = { 212 | k2: v2[0] for k2, v2 in params.items() if k2 != k and len(v2) == 1 213 | } 214 | new_params = urlparse.urlencode(new_params) 215 | new_parsed = parsed._replace(query=new_params) 216 | new_url = urlparse.urlunparse(new_parsed) 217 | try: 218 | resp = session.head(new_url, allow_redirects=True, timeout=timeout) 219 | except ReadTimeout: 220 | resp = None 221 | if is_not_null(resp): 222 | new_parsed = urlparse.urlparse(resp.url) 223 | if new_parsed.query != "" or new_parsed.path not in ["", "/"]: 224 | # If removing a parameter didn't redirect to a root domain... 225 | new_url = resp.url 226 | compare_new = ( 227 | new_url.split("?")[0] if "?" in new_url else new_url 228 | ) 229 | compare_old = url.split("?")[0] if "?" 
in url else url 230 | if compare_new == compare_old: 231 | # And the domain is the same as it was before, then the parameter was probably unnecessary 232 | ditch_params.append(k) 233 | 234 | if len(ditch_params) > 0: 235 | # Now we remove all of the unnecessary get parameters and finalize the URL 236 | new_params = { 237 | k: v[0] for k, v in params.items() if len(v) == 1 and k not in ditch_params 238 | } 239 | new_params = urlparse.urlencode(new_params) 240 | parsed = parsed._replace(query=new_params) 241 | url = urlparse.urlunparse(parsed) 242 | 243 | if close_session: 244 | session.close() 245 | 246 | return url 247 | 248 | 249 | def extract_domain_from_url( 250 | url, 251 | include_subdomain=True, 252 | resolve_url=False, 253 | timeout=1.0, 254 | session=None, 255 | user_agent=None, 256 | expand_shorteners=True, 257 | ): 258 | 259 | """ 260 | Attempts to extract a standardized domain from a url by following the link and extracting the TLD. 261 | 262 | :param url: The link from which to extract the domain 263 | :type url: str 264 | :param include_subdomain: Whether or not to include the subdomain (e.g. 'news.google.com'); default is True 265 | :type include_subdomain: bool 266 | :param resolve_url: Whether to fully resolve the URL. If False (default), it will operate on the URL as-is; if \ 267 | True, the URL will be passed to :py:func:`pewtils.http.canonical_link` to be standardized prior to extracting the \ 268 | domain. 269 | :param timeout: (Optional, for use with ``resolve_url``) Maximum number of seconds to wait on a request before \ 270 | timing out (default is 1) 271 | :type timeout: int or float 272 | :param session: (Optional, for use with ``resolve_url``) A persistent session that can optionally be passed \ 273 | (useful if you're processing many links at once) 274 | :type session: :py:class:`requests.Session` object 275 | :param user_agent: (Optional, for use with ``resolve_url``) User agent for the auto-created requests Session to use, \ 276 | if a preconfigured requests Session is not provided 277 | :type user_agent: str 278 | :param expand_shorteners: If True, shortened URLs that don't successfully expand will be checked against a list \ 279 | of known URL shorteners and expanded if recognized. (Default = True) 280 | :type expand_shorteners: bool 281 | :return: The domain for the link 282 | :rtype: str 283 | 284 | .. note:: If ``resolve_url`` is set to True, the link will be standardized prior to domain extraction (in which \ 285 | case you can provide optional timeout, session, and user_agent parameters that will be passed to \ 286 | :py:func:`pewtils.http.canonical_link`). By default, however, the link will be operated on as-is. The final \ 287 | extracted domain is then checked against known URL shorteners (see :ref:`vanity_link_shorteners`) and if it \ 288 | is recognized, the expanded domain will be returned instead. Shortened URLs that are not standardized and \ 289 | do not follow patterns included in this dictionary of known shorteners may be returned with an incorrect domain. 
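    .. note:: As a hedged illustration (the authoritative mappings live in ``vanity_link_shorteners.csv``), \
        with ``expand_shorteners=True`` a recognized vanity shortener domain such as ``pewrsr.ch`` would be \
        replaced by its expanded counterpart, while ordinary domains pass through unchanged.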
290 | 
291 |     Usage::
292 | 
293 |         from pewtils.http import extract_domain_from_url
294 | 
295 |         >>> extract_domain_from_url("http://forums.bbc.co.uk", include_subdomain=False)
296 |         "bbc.co.uk"
297 |         >>> extract_domain_from_url("http://forums.bbc.co.uk", include_subdomain=True)
298 |         "forums.bbc.co.uk"
299 | 
300 |     """
301 | 
302 |     if resolve_url:
303 |         url = canonical_link(
304 |             url, timeout=timeout, session=session, user_agent=user_agent
305 |         )
306 |     domain = tldextract.extract(url)
307 |     if domain:
308 |         if include_subdomain and domain.subdomain and domain.subdomain != "www":
309 |             domain = ".".join([domain.subdomain, domain.domain, domain.suffix])
310 |         else:
311 |             domain = ".".join([domain.domain, domain.suffix])
312 |         if expand_shorteners:
313 |             domain = VANITY_LINK_SHORTENERS.get(domain, domain)
314 |     return domain
315 | 
316 | 
317 | def canonical_link(url, timeout=5.0, session=None, user_agent=None):
318 | 
319 |     """
320 |     Tries to resolve a link to the "most correct" version.
321 | 
322 |     Useful for expanding short URLs from bit.ly / Twitter and for checking HTTP status codes without retrieving \
323 |     the actual data. Follows redirects and tries to pick the most informative version of a URL while avoiding \
324 |     redirects to generic 404 pages. Also tries to iteratively remove optional GET parameters.
325 | 
326 |     May not be particularly effective on dead links, but may still be able to follow redirects enough \
327 |     to return a URL with the correct domain associated with the original link.
328 | 
329 |     :param url: The URL to test. Should be fully qualified.
330 |     :type url: str
331 |     :param timeout: How long to wait for a response before giving up (default is 5 seconds)
332 |     :type timeout: int or float
333 |     :param session: (Optional) A persistent session that can optionally be passed (useful if you're processing many \
334 |     links at once)
335 |     :type session: :py:class:`requests.Session` object
336 |     :param user_agent: User agent for the auto-created requests Session to use, if a preconfigured requests Session \
337 |     is not provided
338 |     :type user_agent: str
339 |     :return: The "canonical" URL as supplied by the server, or the original URL if none supplied.
340 |     :rtype: str
341 | 
342 |     .. note:: See :ref:`link_shorteners` for a complete list of shortened links recognized by this function.
343 | 
344 |     This function might not resolve *all* existing URL modifications, but it has been tested on a large and \
345 |     varied collection of URLs. It typically resolves a URL to the correct final page while avoiding redirects to \
346 |     generic error pages.
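    .. note:: Network errors are swallowed rather than raised, so the worst-case behavior is simply returning \
        the input unchanged; e.g. ``canonical_link("http://a.completely.dead.example")`` (a hypothetical dead \
        host, for illustration) would come back as-is.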
347 | 
348 |     Usage::
349 | 
350 |         from pewtils.http import canonical_link
351 | 
352 |         >>> canonical_link("https://pewrsr.ch/2lxB0EX")
353 |         "https://www.pewresearch.org/interactives/how-does-a-computer-see-gender/"
354 | 
355 |     """
356 | 
357 |     BAD_STATUS_CODES = [
358 |         302,
359 |         307,
360 |         400,
361 |         404,
362 |         405,
363 |         407,
364 |         500,
365 |         501,
366 |         502,
367 |         503,
368 |         504,
369 |         520,
370 |         530,
371 |     ]
372 | 
373 |     PROXY_REQUIRED = [307, 407]
374 |     CHECK_LENGTH = [301, 302, 200, 404]
375 | 
376 |     close_session = False
377 |     if not session:
378 |         close_session = True
379 |         session = requests.Session()
380 |         session.headers.update({"User-Agent": user_agent})
381 |     if not url.startswith("http"):
382 |         url = "http://" + url
383 |     response = None
384 |     try:
385 |         with Timeout(timeout):
386 |             try:
387 |                 response = session.head(url, allow_redirects=True, timeout=timeout)
388 |             except requests.ConnectionError:
389 |                 try:
390 |                     response = session.head(url, allow_redirects=False, timeout=timeout)
391 |                 except:
392 |                     pass
393 |     except:
394 |         pass
395 | 
396 |     if response:
397 | 
398 |         history = [(h.status_code, h.url) for h in response.history]
399 |         history.append((response.status_code, response.url))
400 | 
401 |         last_good_url = history[0][1]
402 |         original_parsed = urlparse.urlparse(last_good_url)
403 |         has_path = original_parsed.path not in ["/", ""]
404 |         has_query = original_parsed.query != ""
405 |         prev_was_shortener = False
406 |         prev_path = None
407 |         prev_query = None
408 |         status_code = None
409 |         for i, resp in enumerate(history):
410 |             status_code, response_url = resp
411 |             if "errors/404" in response_url:
412 |                 # If it's clearly a 404 landing page, stop and use the last observed good URL
413 |                 break
414 |             parsed = urlparse.urlparse(response_url)
415 |             if (
416 |                 parsed.netloc in VANITY_LINK_SHORTENERS.keys()
417 |                 or parsed.netloc in GENERAL_LINK_SHORTENERS
418 |             ):
419 |                 # Don't consider known shortened URLs
420 |                 is_shortener = True
421 |             else:
422 |                 is_shortener = False
423 |             if not is_shortener:
424 |                 if i != 0:
425 |                     for param, val in urlparse.parse_qs(parsed.query).items():
426 |                         if len(val) == 1 and val[0].startswith("http"):
427 |                             parsed_possible_url = urlparse.urlparse(val[0])
428 |                             if (
429 |                                 parsed_possible_url.scheme
430 |                                 and parsed_possible_url.netloc
431 |                             ):
432 |                                 # If the URL contains a GET parameter that is, itself, a URL, it's likely redirecting
433 |                                 # to it, so we're going to stop this run and start the process over with the new URL
434 |                                 return canonical_link(
435 |                                     val[0],
436 |                                     timeout=timeout,
437 |                                     session=session,
438 |                                     user_agent=user_agent,
439 |                                 )
440 |                 if status_code in PROXY_REQUIRED:
441 |                     # These codes tend to indicate the last good URL in the chain
442 |                     last_good_url = response_url
443 |                     break
444 |                 good_path = not has_path or parsed.path not in ["/", ""]
445 |                 good_query = not has_query or parsed.query != ""
446 |                 # If the URL has a path or some GET parameters, we'll inspect further
47 |                 # Otherwise we just go with the previous URL
448 |                 # Link shorteners are very rarely used to reference root domains
449 |                 if good_query or good_path:
450 |                     if (
451 |                         re.sub("https", "http", response_url)
452 |                         == re.sub("https", "http", last_good_url)
453 |                         or parsed.path == original_parsed.path
454 |                     ) or response_url.lower() == last_good_url.lower():
455 |                         # If it's the same link but only the domain, protocol, or casing changed, it's fine
456 |                         last_good_url = response_url
457 |                     elif i != 0 and status_code in CHECK_LENGTH:
458 |                         # For these codes, we're going to see how much the link changed
459 |                         # Redirects and 404s sometimes preserve a decent URL, sometimes they go to a landing page
460 |                         # The following cutoffs seem to do a good job most of the time:
461 |                         # 1) The new URL has a long domain more than 7 characters, so it's not likely a shortened URL
462 |                         # 2) The prior URL had a long path and this one has fewer than 20 characters and it wasn't
463 |                         #    swapped out for GET params
464 |                         # 3) Or the prior URL had GET params and this one has far fewer and no replacement path
465 |                         # If these conditions are met and the path or query do not identically match the prior link
466 |                         # Then it's usually a generic error page
467 |                         bad = False
468 |                         if (
469 |                             has_path
470 |                             and len(parsed.netloc) > 7
471 |                             and len(parsed.path) < 20
472 |                             and len(parsed.query) == 0
473 |                             and prev_path != parsed.path
474 |                         ) or (
475 |                             has_query
476 |                             and len(parsed.netloc) > 7
477 |                             and len(parsed.query) < 20
478 |                             and len(parsed.path) <= 1
479 |                             and prev_query != parsed.query
480 |                         ):
481 |                             bad = True
482 |                         if not bad or prev_was_shortener:
483 |                             last_good_url = response_url
484 |                             # print("GOOD: {}, {}".format(status_code, response_url))
485 |                         else:
486 |                             # These can sometimes resolve further though, so we continue onward
487 |                             prev_path = None
488 |                             prev_query = None
489 |                     else:
490 |                         if status_code not in BAD_STATUS_CODES:
491 |                             last_good_url = response_url
492 |                         else:
493 |                             break
494 |             else:
495 |                 # Resolved to a general URL
496 |                 break
497 | 
498 |             prev_was_shortener = is_shortener
499 |             prev_path = parsed.path
500 |             prev_query = parsed.query
501 | 
502 |         if status_code not in BAD_STATUS_CODES:
503 |             # If the URL ended on a good status code, we'll try to trim out any unnecessary GET parameters
504 |             last_good_url = trim_get_parameters(
505 |                 last_good_url, session=session, timeout=timeout, user_agent=user_agent
506 |             )
507 | 
508 |         url = last_good_url
509 | 
510 |     if close_session:
511 |         session.close()
512 | 
513 |     return url
514 | 
--------------------------------------------------------------------------------
/pewtils/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | import chardet
3 | import copy
4 | import json
5 | import imp
6 | import multiprocessing
7 | import os
8 | import re
9 | import signal
10 | import sys
11 | import time
12 | import warnings
13 | import zipcodes
14 | 
15 | try:
16 |     from importlib.machinery import SourceFileLoader
17 | except ImportError:
18 |     import imp
19 | 
20 | import pandas as pd
21 | import numpy as np
22 | 
23 | from contextlib import closing
24 | from hashlib import md5
25 | from random import uniform
26 | from scandir import walk
27 | from unidecode import unidecode
28 | 
29 | 
30 | class classproperty(object):
31 | 
32 |     """
33 |     This decorator allows you to define functions on a class that are accessible directly from the
34 |     class itself (rather than an instance of the class). It allows you to access ``classproperty``
35 |     attributes directly, such as ``obj.property``, rather than as a function on a class instance
36 |     (like ``obj = Obj(); obj.property()``).
37 | 
38 |     Borrowed from a StackOverflow post.
39 | 
40 |     Usage::
41 | 
42 |         from pewtils import classproperty
43 | 
44 |         class MyClass(object):
45 |             x = 4
46 | 
47 |             @classproperty
48 |             def number(cls):
49 |                 return cls.x
50 | 
51 |         >>> MyClass().number
52 |         4
53 |         >>> MyClass.number
54 |         4
55 |     """
56 | 
57 |     def __init__(self, fget):
58 |         self.fget = fget
59 | 
60 |     def __get__(self, owner_self, owner_cls):
61 |         return self.fget(owner_cls)
62 | 
63 | 
64 | def is_not_null(val, empty_lists_are_null=False, custom_nulls=None):
65 | 
66 |     """
67 |     Checks whether the value is null, using a variety of potential string values, etc. The following values are always
68 |     considered null: ``numpy.nan, None, "None", "nan", "", " ", "NaN", "none", "n/a", "NONE", "N/A"``
69 | 
70 |     :param val: The value to check
71 |     :param empty_lists_are_null: Whether or not an empty list or :py:class:`pandas.DataFrame` should be considered \
72 |     null (default=False)
73 |     :type empty_lists_are_null: bool
74 |     :param custom_nulls: an optional list of additional values to consider as null
75 |     :type custom_nulls: list
76 |     :return: True if the value is not null
77 |     :rtype: bool
78 | 
79 |     Usage::
80 | 
81 |         from pewtils import is_not_null
82 | 
83 |         >>> text = "Hello"
84 |         >>> is_not_null(text)
85 |         True
86 |     """
87 | 
88 |     null_values = [None, "None", "nan", "", " ", "NaN", "none", "n/a", "NONE", "N/A"]
89 |     if custom_nulls:
90 |         null_values.extend(custom_nulls)
91 |     if type(val) == list:
92 |         if empty_lists_are_null and val == []:
93 |             return False
94 |         else:
95 |             return True
96 |     elif isinstance(val, pd.Series) or isinstance(val, pd.DataFrame):
97 |         if empty_lists_are_null and len(val) == 0:
98 |             return False
99 |         else:
100 |             return True
101 |     else:
102 |         try:
103 |             try:
104 |                 good = val not in null_values
105 |                 if good:
106 |                     try:
107 |                         try:
108 |                             good = not pd.isnull(val)
109 |                         except IndexError:
110 |                             good = True
111 |                     except AttributeError:
112 |                         good = True
113 |                 return good
114 |             except ValueError:
115 |                 return val.any()
116 |         except TypeError:
117 |             return val is not None  # the original ``isinstance(val, None)`` would itself raise a TypeError
118 | 
119 | 
120 | def is_null(val, empty_lists_are_null=False, custom_nulls=None):
121 | 
122 |     """
123 |     Returns the opposite of the outcome of :py:func:`pewtils.is_not_null`. The following values are always \
124 |     considered null: ``numpy.nan, None, "None", "nan", "", " ", "NaN", "none", "n/a", "NONE", "N/A"``
125 | 
126 |     :param val: The value to check
127 |     :param empty_lists_are_null: Whether or not an empty list or :py:class:`pandas.DataFrame` should be considered \
128 |     null (default=False)
129 |     :type empty_lists_are_null: bool
130 |     :param custom_nulls: an optional list of additional values to consider as null
131 |     :type custom_nulls: list
132 |     :return: True if the value is null
133 |     :rtype: bool
134 | 
135 |     Usage::
136 | 
137 |         from pewtils import is_null
138 | 
139 |         >>> empty_list = []
140 |         >>> is_null(empty_list, empty_lists_are_null=True)
141 |         True
142 |     """
143 | 
144 |     return not is_not_null(
145 |         val, empty_lists_are_null=empty_lists_are_null, custom_nulls=custom_nulls
146 |     )
147 | 
148 | 
149 | def decode_text(text, throw_loud_fail=False):
150 | 
151 |     """
152 |     Attempts to decode and re-encode text as ASCII. In the case of failure, it will attempt to detect the string's \
153 |     encoding, decode it, and convert it to ASCII. If both these attempts fail, it will attempt to use the \
154 |     :py:mod:`unidecode` package to transliterate into ASCII. And finally, if that doesn't work, it will forcibly \
155 |     encode the text as ASCII and ignore non-ASCII characters.
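    A minimal sketch of the Python 3 behavior, where the final fallback is :py:mod:`unidecode` transliteration:

        >>> decode_text("áéíóú")
        'aeiou'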
156 | 157 | .. warning:: This function is potentially destructive to source input and should be used with some care. \ 158 | Input text that cannot be decoded may be stripped out, or replaced with a similar ASCII character or other \ 159 | placeholder, potentially resulting in an empty string. 160 | 161 | :param text: The text to process 162 | :type text: str 163 | :param throw_loud_fail: If True, exceptions will be raised, otherwise the function will fail silently and \ 164 | return an empty string (default False) 165 | :type throw_loud_fail: bool 166 | :return: Decoded text, or empty string 167 | :rtype: str 168 | 169 | .. note:: In Python 3, the decode/encode attempts will fail by default, and the :py:mod:`unidecode` package will \ 170 | be used to transliterate. In general, you shouldn't need to use this function in Python 3, but it shouldn't \ 171 | hurt anything if you do. 172 | 173 | """ 174 | 175 | output_text = "" 176 | with warnings.catch_warnings(): 177 | warnings.simplefilter("ignore") 178 | if is_not_null(text): 179 | try: 180 | text = u"{}".format(text) 181 | output_text = text.decode("ascii").encode("ascii") 182 | except (AttributeError, TypeError, UnicodeEncodeError, UnicodeDecodeError): 183 | try: 184 | output_text = text.decode(chardet.detect(text)["encoding"]) 185 | output_text = output_text.encode("ascii") 186 | except ( 187 | AttributeError, 188 | TypeError, 189 | UnicodeEncodeError, 190 | UnicodeDecodeError, 191 | ): 192 | try: 193 | output_text = unidecode(text) 194 | except ( 195 | AttributeError, 196 | TypeError, 197 | UnicodeEncodeError, 198 | UnicodeDecodeError, 199 | ): 200 | if throw_loud_fail: 201 | output_text = text.decode("ascii", "ignore").encode("ascii") 202 | else: 203 | try: 204 | output_text = text.decode("ascii", "ignore").encode( 205 | "ascii" 206 | ) 207 | except ( 208 | AttributeError, 209 | TypeError, 210 | UnicodeEncodeError, 211 | UnicodeDecodeError, 212 | ): 213 | print("Could not decode") 214 | print(text) 215 | output_text = output_text.replace("\x00", "").replace("\u0000", "") 216 | 217 | return output_text 218 | 219 | 220 | def get_hash(text, hash_function="ssdeep"): 221 | 222 | """ 223 | Generates hashed text using one of several available hashing functions. 224 | 225 | :param text: The string to hash 226 | :type text: str 227 | :param hash_function: The specific algorithm to use; options are ``'nilsimsa'``, ``'md5'``, and ``'ssdeep'`` \ 228 | (default) 229 | :type hash_function: str 230 | :return: A hashed representation of the provided string 231 | :rtype: str 232 | 233 | .. note:: The string will be passed through :py:func:`pewtils.decode_text` and the returned value will be used \ 234 | instead of the original value if it runs successfully, in order to ensure consistent hashing in both Python 2 and \ 235 | 3. By default the function uses the :py:mod:`ssdeep` algorithm, which generates context-sensitive hashes that are \ 236 | useful for computing document similarities at scale. 237 | 238 | .. note:: Using `hash_function='ssdeep'` requires the :py:mod:`ssdeep` library, which is not installed by default \ 239 | because it requires the installation of additional system libraries on certain operating systems. For help \ 240 | installing ssdeep, refer to the pewtils documentation installation section, which provides OS-specific instructions. 
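    .. note:: With ``hash_function="md5"``, the text is first encoded to UTF-8 bytes, so \
        ``get_hash("test_string", hash_function="md5")`` returns the standard 32-character hexadecimal md5 \
        digest of those bytes (exact digest omitted here).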
241 | 
242 |     Usage::
243 | 
244 |         from pewtils import get_hash
245 | 
246 |         >>> text = 'test_string'
247 |         >>> get_hash(text)
248 |         '3:HI2:Hl'
249 |     """
250 | 
251 |     decoded_text = decode_text(text).encode("utf8").strip()
252 |     if not decoded_text:  # an ``== ""`` check would never match the bytes value in Python 3
253 |         decoded_text = text
254 |     text = decoded_text
255 |     if hash_function == "nilsimsa":
256 |         from nilsimsa import Nilsimsa
257 | 
258 |         hashed = Nilsimsa(text).hexdigest()
259 |     elif hash_function == "md5":
260 |         hashed = md5(text).hexdigest()
261 |     else:
262 |         try:
263 |             import ssdeep
264 |         except ImportError:
265 |             raise Exception(
266 |                 """
267 |                 To use get_hash with hash_function='ssdeep' you need to install the ssdeep package. Try running:
268 |                 >> BUILD_LIB=1 pip install ssdeep
269 |                 If you encounter installation problems, refer to the pewtils documentation for troubleshooting help.
270 |                 """
271 |             )
272 |         hashed = ssdeep.hash(text)
273 | 
274 |     return hashed
275 | 
276 | 
277 | def zipcode_num_to_string(zipcode):
278 | 
279 |     """
280 |     Attempts to standardize a string/integer/float that contains a U.S. zipcode. Front-pads with zeroes and uses the \
281 |     :py:mod:`zipcodes` library to ensure that the zipcode is real. If the zipcode doesn't validate successfully, \
282 |     ``None`` will be returned.
283 | 
284 |     :param zipcode: Object that contains a sequence of digits (string, integer, float)
285 |     :type zipcode: str or float or int
286 |     :return: A 5-digit string, or None
287 |     :rtype: str or NoneType
288 | 
289 |     Usage::
290 | 
291 |         from pewtils import zipcode_num_to_string
292 | 
293 |         >>> zipcode_number = 6463
294 |         >>> zipcode_num_to_string(zipcode_number)
295 |         '06463'
296 |         >>> not_zipcode_number = 345678
297 |         >>> zipcode_num_to_string(not_zipcode_number)
298 |         >>>
299 |     """
300 | 
301 |     if is_not_null(zipcode):
302 | 
303 |         try:
304 |             zipcode = str(int(str(zipcode).strip()[:5].split(".")[0]))
305 |         except (TypeError, ValueError):
306 |             zipcode = None
307 | 
308 |         if zipcode:
309 |             zipcode = zipcode.zfill(5)
310 |             if zipcodes.is_real(zipcode):
311 |                 return zipcode
312 |             else:
313 |                 return None
314 |     else:
315 | 
316 |         zipcode = None
317 | 
318 |     return zipcode
319 | 
320 | 
321 | def concat_text(*args):
322 | 
323 |     """
324 |     A helper function for concatenating text values. Text values are passed through :py:func:`pewtils.decode_text` \
325 |     before concatenation.
326 | 
327 |     :param args: A list of text values that will be returned as a single space-separated string
328 |     :type args: list
329 |     :return: A single string of the values concatenated by spaces
330 |     :rtype: str
331 | 
332 |     Usage::
333 | 
334 |         from pewtils import concat_text
335 | 
336 |         >>> text_list = ['Hello', 'World', '!']
337 |         >>> concat_text(*text_list)
338 |         'Hello World !'
339 |     """
340 | 
341 |     strs = [decode_text(arg) for arg in args if is_not_null(arg)]
342 |     return " ".join(strs) if is_not_null(strs, empty_lists_are_null=True) else ""
343 | 
344 | 
345 | def vector_concat_text(*args):
346 | 
347 |     """
348 |     Takes a list of equal-length lists and returns a single list with the rows concatenated by spaces. Useful for \
349 |     merging multiple columns of text in Pandas.
350 | 
351 |     :param args: A list of lists or :py:class:`pandas.Series` objects that contain text values
352 |     :return: A single list or :py:class:`pandas.Series` with all of the text values for each row concatenated
353 | 
354 |     Usage with lists::
355 | 
356 |         from pewtils import vector_concat_text
357 | 
358 |         >>> text_lists = ["one", "two", "three"], ["a", "b", "c"]
359 |         >>> vector_concat_text(*text_lists)
360 |         ['one a', 'two b', 'three c']
361 | 
362 |     Usage with Pandas::
363 | 
364 |         import pandas as pd
365 |         from pewtils import vector_concat_text
366 | 
367 |         df = pd.DataFrame([
368 |             {"text1": "one", "text2": "a"},
369 |             {"text1": "two", "text2": "b"},
370 |             {"text1": "three", "text2": "c"}
371 |         ])
372 | 
373 |         >>> df['text'] = vector_concat_text(df['text1'], df['text2'])
374 |         >>> df['text']
375 |         0      one a
376 |         1      two b
377 |         2    three c
378 |         Name: text, dtype: object
379 |     """
380 | 
381 |     return np.vectorize(concat_text)(*args)
382 | 
383 | 
384 | def scale_range(old_val, old_min, old_max, new_min, new_max):
385 | 
386 |     """
387 |     Scales a value from one range to another. Useful for comparing values from different scales, for example.
388 | 
389 |     :param old_val: The value to convert
390 |     :type old_val: int or float
391 |     :param old_min: The minimum of the old range
392 |     :type old_min: int or float
393 |     :param old_max: The maximum of the old range
394 |     :type old_max: int or float
395 |     :param new_min: The minimum of the new range
396 |     :type new_min: int or float
397 |     :param new_max: The maximum of the new range
398 |     :type new_max: int or float
399 |     :return: Value equivalent from the new scale
400 |     :rtype: float
401 | 
402 |     Usage::
403 | 
404 |         from pewtils import scale_range
405 | 
406 |         >>> old_value = 5
407 |         >>> scale_range(old_value, 0, 10, 0, 20)
408 |         10.0
409 |     """
410 | 
411 |     return (
412 |         ((float(old_val) - float(old_min)) * (float(new_max) - float(new_min)))
413 |         / (float(old_max) - float(old_min))
414 |     ) + float(new_min)
415 | 
416 | 
417 | def new_random_number(attempt=1, minimum=1.0, maximum=10):
418 | 
419 |     """
420 |     Returns a random number from a range whose upper bound grows exponentially with ``attempt``.
421 |     The upper bound is capped using the ``maximum`` parameter (default 10) but is otherwise determined by the
422 |     function ``minimum * 2 ** attempt``.
423 | 
424 |     | In effect, this means that when ``attempt`` is 1, the number returned will be in the range of the minimum \
425 |     and twice the minimum's value. As you increase ``attempt``, the possible range of returned values expands \
426 |     exponentially until it hits the ``maximum`` ceiling.
427 | 
428 |     :param attempt: Increasing attempt will expand the upper-bound of the range from which the random number is drawn
429 |     :type attempt: int
430 |     :param minimum: The minimum allowed value that can be returned; must be greater than zero.
431 |     :type minimum: int or float
432 |     :param maximum: The maximum allowed value that can be returned; must be greater than ``minimum``.
433 |     :type maximum: int or float
434 |     :return: A random number drawn uniformly from across the range determined by the provided arguments.
435 |     :rtype: float
436 | 
437 |     .. note:: One useful application of this function is rate limiting: a script can pause in between requests at a \
438 |     reasonably fast pace, but then moderate itself and pause for longer periods if it begins encountering errors, \
439 |     simply by increasing the ``attempt`` variable (hence its name).
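    .. note:: A quick worked example of the bound: with ``attempt=3``, ``minimum=1.0``, and ``maximum=10``, \
        the draw is uniform over ``[1.0, min(10, 1.0 * 2 ** 3)] = [1.0, 8.0]``; by ``attempt=4`` the \
        ``maximum`` cap of 10 takes over.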
440 | 
441 |     Usage::
442 | 
443 |         from pewtils import new_random_number
444 | 
445 |         >>> new_random_number(attempt=1)
446 |         1.9835581813820642
447 |         >>> new_random_number(attempt=2)
448 |         3.1022350739064
449 |     """
450 | 
451 |     return uniform(minimum, min(maximum, minimum * 2 ** attempt))
452 | 
453 | 
454 | def chunk_list(seq, size):
455 | 
456 |     """
457 |     Takes a sequence and groups values into smaller lists based on the specified size.
458 | 
459 |     :param seq: List or a list-like iterable
460 |     :type seq: list or iterable
461 |     :return: A generator that yields sub-lists of up to ``size`` elements
462 |     :param size: Desired size of each sublist
463 |     :type size: int
464 |     :rtype: generator
465 | 
466 |     Usage::
467 | 
468 |         from pewtils import chunk_list
469 | 
470 |         >>> number_sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
471 |         >>> list(chunk_list(number_sequence, 3))
472 |         [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10]]
473 |     """
474 | 
475 |     return (seq[pos : (pos + size)] for pos in range(0, len(seq), size))
476 | 
477 | 
478 | def flatten_list(l):
479 | 
480 |     """
481 |     Takes a list of lists and flattens it into a single list. Nice shortcut to avoid having to deal with list \
482 |     comprehension.
483 | 
484 |     :param l: A list of lists
485 |     :type l: list
486 |     :return: A flattened list of all of the elements contained in the original list of lists
487 |     :rtype: list
488 | 
489 |     Usage::
490 | 
491 |         from pewtils import flatten_list
492 | 
493 |         >>> nested_lists = [[1, 2, 3], [4, 5, 6]]
494 |         >>> flatten_list(nested_lists)
495 |         [1, 2, 3, 4, 5, 6]
496 |     """
497 | 
498 |     return [item for sublist in l for item in sublist]
499 | 
500 | 
501 | def scan_dictionary(search_dict, field):
502 | 
503 |     """
504 |     Takes a dictionary with nested lists and dictionaries, and searches recursively for a specific key. Since keys can
505 |     occur more than once, the function returns a list of all of the found values along with a list of equal length
506 |     that specifies the nested key path to each value.
507 | 
508 |     :param search_dict: The dictionary to search
509 |     :type search_dict: dict
510 |     :param field: The field to find
511 |     :type field: str
512 |     :return: A tuple of the found values and file path-style strings representing their locations
513 |     :rtype: tuple
514 | 
515 |     Usage::
516 | 
517 |         from pewtils import scan_dictionary
518 | 
519 |         >>> test_dict = {"one": {"two": {"three": "four"}}}
520 |         >>> scan_dictionary(test_dict, "three")
521 |         (['four'], ['one/two/three/'])
522 |         >>> scan_dictionary(test_dict, "five")
523 |         ([], [])
524 |     """
525 | 
526 |     fields_found = []
527 |     key_path = []
528 | 
529 |     for key, value in search_dict.items():
530 |         if key == field:
531 |             fields_found.append(value)
532 |             new_str = str(key) + "/"
533 |             key_path.append(new_str)
534 | 
535 |         elif isinstance(value, dict):
536 |             results, path = scan_dictionary(value, field)
537 |             for result in results:
538 |                 fields_found.append(result)
539 |             for road in path:
540 |                 new_str = str(key) + "/" + road
541 |                 key_path.append(new_str)
542 | 
543 |         elif isinstance(value, list):
544 |             for item in value:
545 |                 if isinstance(item, dict):
546 |                     more_results, more_path = scan_dictionary(item, field)
547 |                     for another_result in more_results:
548 |                         fields_found.append(another_result)
549 |                     for another_road in more_path:
550 |                         new_str = str(key) + "/" + another_road
551 |                         key_path.append(new_str)
552 | 
553 |     return fields_found, key_path
554 | 
555 | 
556 | def recursive_update(existing, new):
557 | 
558 |     """
559 |     Takes an object and a dictionary representation of attributes and values, and recursively traverses through the
560 |     new values and updates the object.
561 | 
562 |     | The keys in ``new`` can correspond to either attribute names or dictionary keys on ``existing``; \
563 |     you can use this to iterate through a nested hierarchy of objects and dictionaries and update whatever you like.
564 | 565 | :param existing: An object or dictionary 566 | :type existing: dict or object 567 | :param new: A dictionary where keys correspond to the names of keys in the existing dictionary or attributes on \ 568 | the existing object 569 | :type new: dict or object 570 | :return: A copy of the original object or dictionary, with the values updated based on the provided map 571 | :rtype: dict or object 572 | 573 | Usage:: 574 | 575 | from pewtils import recursive_update 576 | 577 | class TestObject(object): 578 | def __init__(self, value): 579 | self.value = value 580 | self.dict = {"obj_key": "original"} 581 | def __repr__(self): 582 | return("TestObject(value='{}', dict={})".format(self.value, self.dict)) 583 | 584 | original = { 585 | "object": TestObject("original"), 586 | "key1": {"key2": "original"} 587 | } 588 | update = { 589 | "object": {"value": "updated", "dict": {"obj_key": "updated"}}, 590 | "key1": {"key3": "new"} 591 | } 592 | 593 | >>> recursive_update(original, update) 594 | {'object': TestObject(value='updated', dict={'obj_key': 'updated'}), 595 | 'key1': {'key2': 'original', 'key3': 'new'}} 596 | 597 | """ 598 | 599 | def _hasattr(obj, attr): 600 | if isinstance(obj, dict): 601 | return attr in obj 602 | else: 603 | return hasattr(obj, attr) 604 | 605 | def _setattr(obj, attr, val): 606 | if isinstance(obj, dict): 607 | obj[attr] = val 608 | else: 609 | setattr(obj, attr, val) 610 | return obj 611 | 612 | def _getattr(obj, attr): 613 | if isinstance(obj, dict): 614 | return obj[attr] 615 | else: 616 | return getattr(obj, attr) 617 | 618 | existing = copy.deepcopy(existing) 619 | if isinstance(new, dict): 620 | for k, v in new.items(): 621 | 622 | if _hasattr(existing, k): 623 | _setattr( 624 | existing, 625 | k, 626 | recursive_update(_getattr(existing, k), _getattr(new, k)), 627 | ) 628 | else: 629 | _setattr(existing, k, _getattr(new, k)) 630 | return existing 631 | else: 632 | return new 633 | 634 | 635 | def cached_series_mapper(series, function): 636 | 637 | """ 638 | Applies a function to all of the unique values in a :py:class:`pandas.Series` to avoid repeating the operation \ 639 | on duplicate values. 640 | 641 | | Great if you're doing database lookups or something computationally intensive on a column that may contain \ 642 | repeating values, etc. 643 | 644 | :param series: A :py:class:`pandas.Series` 645 | :type series: :py:class:`pandas.Series` 646 | :param function: A function to apply to values in the :py:class:`pandas.Series` 647 | :return: The resulting :py:class:`pandas.Series` 648 | :rtype: :py:class:`pandas.Series` 649 | 650 | Usage:: 651 | 652 | import pandas as pd 653 | from pewtils import cached_series_mapper 654 | 655 | values = ["value"]*10 656 | def my_function(x): 657 | print(x) 658 | return x 659 | 660 | df = pd.DataFrame(values, columns=['column']) 661 | >>> mapped = df['column'].map(my_function) 662 | value 663 | value 664 | value 665 | value 666 | value 667 | value 668 | value 669 | value 670 | value 671 | value 672 | >>> mapped = cached_series_mapper(df['column'], my_function) 673 | value 674 | """ 675 | 676 | val_map = {} 677 | for val in series.unique(): 678 | val_map[val] = function(val) 679 | 680 | return series.map(val_map) 681 | 682 | 683 | def multiprocess_group_apply(grp, func, *args, **kwargs): 684 | """ 685 | 686 | Apply arbitrary functions to groups or slices of a Pandas DataFrame using multiprocessing, to efficiently \ 687 | map or aggregate data. 
Each group gets processed in parallel, and the results are concatenated together after \
688 |     all processing has finished. If you pass a function that aggregates each group into a single value, you'll get \
689 |     back a DataFrame with one row for each group, as though you had performed a `.agg` function. \
690 |     If you pass a function that returns a value for each _row_ in the group, then you'll get back a DataFrame \
691 |     in your original shape. In this case, you would simply be using grouping to efficiently apply a row-level operation.
692 | 
693 |     :param grp: A Pandas DataFrameGroupBy object
694 |     :type grp: pandas.core.groupby.generic.DataFrameGroupBy
695 |     :param func: A function that accepts a Pandas DataFrame representing a group from the original DataFrame
696 |     :type func: function
697 |     :param args: Arguments to be passed to the function
698 |     :param kwargs: Keyword arguments to be passed to the function
699 |     :return: The resulting DataFrame
700 |     :rtype: pandas.DataFrame
701 | 
702 |     Usage::
703 | 
704 |         df = pd.DataFrame([
705 |             {"group_col": 1, "value": "one two three"},
706 |             {"group_col": 1, "value": "one two three four"},
707 |             {"group_col": 2, "value": "one two"}
708 |         ])
709 | 
710 |         ### For efficient aggregation
711 | 
712 |         def get_length(grp):
713 |             # Simple function that returns the number of rows in each group
714 |             return len(grp)
715 | 
716 |         >>> df.groupby("group_col").apply(lambda x: len(x))
717 |         1    2
718 |         2    1
719 |         dtype: int64
720 |         >>> multiprocess_group_apply(df.groupby("group_col"), get_length)
721 |         1    2
722 |         2    1
723 |         dtype: int64
724 | 
725 |         ### For efficient mapping
726 | 
727 |         def get_value_length(grp):
728 |             # Simple function that returns the word count of each row in the group
729 |             return grp['value'].map(lambda x: len(x.split()))
730 | 
731 |         >>> df['value'].map(lambda x: len(x.split()))
732 |         0    3
733 |         1    4
734 |         2    2
735 |         Name: value, dtype: int64
736 |         >>> multiprocess_group_apply(df.groupby("group_col"), get_value_length)
737 |         0    3
738 |         1    4
739 |         2    2
740 |         Name: value, dtype: int64
741 | 
742 |         # If you just want to efficiently map a function to your DataFrame and you want to evenly split your
743 |         # DataFrame into groups, you could do the following:
744 | 
745 |         df["group_col"] = (df.reset_index().index.values / (len(df) / multiprocessing.cpu_count())).astype(int)
746 |         df["mapped_value"] = multiprocess_group_apply(df.groupby("group_col"), get_value_length)
747 |         del df["group_col"]
748 | 
749 |     """
750 | 
751 |     results = []
752 |     pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
753 |     for name, group in grp:
754 |         results.append(pool.apply_async(func, (group,) + args, kwargs))
755 |     pool.close()
756 |     pool.join()
757 |     results = [r.get() for r in results]
758 | 
759 |     if not hasattr(results[0], "__len__") or isinstance(results[0], str):
760 |         # Assume it's an aggregation function
761 |         return pd.Series(results, index=[g for g, _ in grp])
762 |     else:
763 |         # Assume you're just mapping the function normally and using the groups to split the data
764 |         return pd.concat(results)
765 | 
766 | 
767 | def extract_json_from_folder(
768 |     folder_path, include_subdirs=False, concat_subdir_names=False
769 | ):
770 | 
771 |     """
772 |     Takes a folder path and traverses it, looking for JSON files. When it finds one, it adds it to a dictionary,
773 |     with the key being the name of the file and the value being the JSON itself. This is useful if you store \
774 |     configurations or various metadata in a nested folder structure, which we do for things like content analysis \
775 |     codebooks.
776 | 
777 |     | Has options for recursively traversing a folder, and for optionally concatenating the subfolder names \
778 |     into the dictionary keys as prefixes.
779 | 
780 |     :param folder_path: The path of the folder to scan
781 |     :type folder_path: str
782 |     :param include_subdirs: Whether or not to recursively scan subfolders
783 |     :type include_subdirs: bool
784 |     :param concat_subdir_names: Whether or not to prefix the dictionary keys with the names of subfolders
785 |     :type concat_subdir_names: bool
786 |     :return: A dictionary containing all of the extracted JSON files as values
787 |     :rtype: dict
788 | 
789 |     Usage::
790 | 
791 |         # For example, let's say we have the following folder structure
792 |         # with various JSON codebooks scattered about:
793 |         #
794 |         # /codebooks
795 |         #     /logos
796 |         #         /antipathy.json
797 |         #     /atp_open_ends
798 |         #         /w29
799 |         #             /sources_of_meaning.json
800 |         #
801 |         # Here's what we'd get depending on the different parameters we use:
802 | 
803 |         from pewtils import extract_json_from_folder
804 |         >>> extract_json_from_folder("codebooks", include_subdirs=False, concat_subdir_names=False)
805 |         {}
806 |         >>> extract_json_from_folder("codebooks", include_subdirs=True, concat_subdir_names=False)
807 |         {
808 |             "logos": {"antipathy": "json would be here"},
809 |             "atp_open_ends": {"w29": {"sources_of_meaning": "json would be here"}}
810 |         }
811 |         >>> extract_json_from_folder("codebooks", include_subdirs=True, concat_subdir_names=True)
812 |         {
813 |             "logos_antipathy": "json would be here",
814 |             "atp_open_ends_w29_sources_of_meaning": "json would be here"
815 |         }
816 |     """
817 | 
818 |     attributes = {}
819 |     subdirs = []
820 |     if os.path.exists(folder_path):
821 |         for path, subdir, files in walk(folder_path):
822 |             if folder_path == path:
823 |                 for file in files:
824 |                     if file.endswith(".json"):
825 |                         key = re.sub(r"\.json$", "", file)  # anchored so only the extension is stripped
826 |                         with closing(open(os.path.join(path, file), "r")) as infile:
827 |                             try:
828 |                                 attributes[key] = json.load(infile)
829 |                             except ValueError:
830 |                                 print("JSON file is invalid: {}".format(file))
831 |                 if subdir:
832 |                     subdirs.append(subdir)
833 | 
834 |     if include_subdirs and len(subdirs) > 0:
835 |         for subdir in subdirs[0]:
836 |             if subdir != "__pycache__":
837 |                 results = extract_json_from_folder(
838 |                     os.path.join(folder_path, subdir),
839 |                     include_subdirs=True,
840 |                     concat_subdir_names=concat_subdir_names,
841 |                 )
842 |                 if not concat_subdir_names:
843 |                     attributes[subdir] = results
844 |                 else:
845 |                     for subattr_name, subattr in results.items():
846 |                         attributes["_".join([subdir, subattr_name])] = subattr
847 | 
848 |     return attributes
849 | 
850 | 
851 | def extract_attributes_from_folder_modules(
852 |     folder_path,
853 |     attribute_name,
854 |     include_subdirs=False,
855 |     concat_subdir_names=False,
856 |     current_subdirs=None,
857 | ):
858 | 
859 |     """
860 |     Takes a folder path and traverses it, looking for Python files that contain an attribute (i.e., class, function,
861 |     etc.) with a given name. It extracts those attributes and returns a dictionary where the keys are the names of the
862 |     files that contained the attributes, and the values are the attributes themselves.
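    For example (hypothetical file and attribute names, for illustration), a folder ``plugins/`` containing \
    ``foo.py`` that defines a function named ``handler`` and ``bar.py`` that does not would yield \
    ``{"foo": <the handler function>}`` when called as ``extract_attributes_from_folder_modules("plugins", "handler")``.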
863 | 864 | This operates exactly the same as :py:func:`pewtils.extract_json_from_folder` except instead of reading JSON files 865 | and adding them as values in the dictionary that gets returned, this function will instead look for Python files 866 | that contain a function, class, method, or attribute with the name you provide in ``attribute_name`` and will load 867 | that attribute in as the values. 868 | 869 | :param folder_path: The path of a folder/module to scan 870 | :type folder_path: str 871 | :param attribute_name: The name of the attribute (class, function, variable, etc.) to extract from files 872 | :type attribute_name: str 873 | :param include_subdirs: Whether or not to recursively scan subfolders 874 | :type include_subdirs: bool 875 | :param concat_subdir_names: Whether or not to prefix the dictionary keys with the names of subfolders 876 | :type concat_subdir_names: bool 877 | :param current_subdirs: Used to track location when recursively iterating a module (do not use) 878 | :return: A dictionary with all of the extracted attributes as values 879 | :rtype: dict 880 | 881 | .. note:: if you use Python 2.7 you will need to add ``from __future__ import absolute_import`` to the top of files \ 882 | that you want to scan and import using this function. 883 | """ 884 | 885 | if not folder_path.startswith(os.getcwd()): 886 | folder_path = os.path.join(os.getcwd(), folder_path) 887 | test_path, _ = os.path.split(folder_path) 888 | while test_path != "/": 889 | if "__init__.py" not in os.listdir(test_path): 890 | break 891 | test_path, _ = os.path.split(test_path) 892 | module_location = test_path 893 | 894 | current_folder = folder_path.split("/")[-1] 895 | if not current_subdirs: 896 | current_subdirs = [] 897 | 898 | attributes = {} 899 | subdirs = [] 900 | if os.path.exists(folder_path): 901 | for path, subdir_list, files in walk(folder_path): 902 | if folder_path == path: 903 | for file in files: 904 | if file.endswith(".py") and not file.startswith("__init__"): 905 | file_name = file.split(".")[0] 906 | module_name = re.sub( 907 | "/", 908 | ".", 909 | re.sub( 910 | module_location, 911 | "", 912 | os.path.splitext(os.path.join(path, file))[0], 913 | ), 914 | ).strip(".") 915 | if module_name in sys.modules: 916 | module = sys.modules[module_name] 917 | # https://github.com/ansible/ansible/issues/13110 918 | else: 919 | try: 920 | module = SourceFileLoader( 921 | module_name, os.path.join(path, file) 922 | ).load_module() 923 | except NameError: 924 | file, pathname, description = imp.find_module( 925 | file_name, [path] 926 | ) 927 | warnings.simplefilter("error", RuntimeWarning) 928 | try: 929 | module = imp.load_module( 930 | module_name, file, pathname, description 931 | ) 932 | except RuntimeWarning: 933 | try: 934 | module = imp.load_module( 935 | module_name.split(".")[-1], 936 | file, 937 | pathname, 938 | description, 939 | ) 940 | except RuntimeWarning: 941 | module = None 942 | except (ImportError, AttributeError): 943 | module = None 944 | except (ImportError, AttributeError): 945 | module = None 946 | if hasattr(module, attribute_name): 947 | attributes[file_name] = getattr(module, attribute_name) 948 | 949 | if subdir_list: 950 | subdirs.extend(subdir_list) 951 | 952 | if include_subdirs: 953 | for subdir in set(subdirs): 954 | results = extract_attributes_from_folder_modules( 955 | os.path.join(folder_path, subdir), 956 | attribute_name, 957 | concat_subdir_names=concat_subdir_names, 958 | include_subdirs=True, 959 | current_subdirs=current_subdirs + 
[current_folder], 960 | ) 961 | if not concat_subdir_names: 962 | attributes[subdir] = results 963 | else: 964 | for subattr_name, subattr in results.items(): 965 | attributes["_".join([subdir, subattr_name])] = subattr 966 | 967 | if is_null(current_subdirs, empty_lists_are_null=True): 968 | for name in attributes.keys(): 969 | try: 970 | attributes[name]._name = name 971 | except AttributeError: 972 | pass 973 | 974 | return attributes 975 | 976 | 977 | class timeout_wrapper: 978 | def __init__(self, seconds=1, error_message="Timeout"): 979 | """ 980 | Context manager that will raise an error if it takes longer than the specified number of seconds to execute. 981 | Found via this very helpful Stack Overflow post: 982 | https://stackoverflow.com/questions/2281850/timeout-function-if-it-takes-too-long-to-finish 983 | 984 | :param seconds: Number of seconds allowed for the code to execute 985 | :param error_message: Optional custom error message to raise 986 | """ 987 | self.seconds = seconds 988 | self.error_message = error_message 989 | 990 | def handle_timeout(self, signum, frame): 991 | raise Exception(self.error_message) 992 | 993 | def __enter__(self): 994 | signal.signal(signal.SIGALRM, self.handle_timeout) 995 | signal.alarm(self.seconds) 996 | 997 | def __exit__(self, t, value, traceback): 998 | signal.alarm(0) 999 | 1000 | 1001 | class PrintExecutionTime(object): 1002 | 1003 | """ 1004 | Simple context manager to print the time it takes for a block of code to execute 1005 | 1006 | :param label: A label to print alongside the execution time 1007 | :param stdout: a StringIO-like output stream (sys.stdout by default) 1008 | 1009 | Usage:: 1010 | 1011 | from pewtils import PrintExecutionTime 1012 | 1013 | >>> with PrintExecutionTime(label="my function"): time.sleep(5) 1014 | my function: 5.004292011260986 seconds 1015 | 1016 | """ 1017 | 1018 | def __init__(self, label=None, stdout=None): 1019 | self.start_time = None 1020 | self.end_time = None 1021 | self.label = label 1022 | self.stdout = sys.stdout if not stdout else stdout 1023 | 1024 | def __enter__(self): 1025 | self.start_time = time.time() 1026 | return self 1027 | 1028 | def __exit__(self, exc_type, exc_value, exc_traceback): 1029 | self.end_time = time.time() 1030 | if self.label: 1031 | self.stdout.write( 1032 | "{}: {} seconds".format(self.label, self.end_time - self.start_time) 1033 | ) 1034 | else: 1035 | self.stdout.write("{} seconds".format(self.end_time - self.start_time)) 1036 | --------------------------------------------------------------------------------