├── .codeclimate.yml ├── .editorconfig ├── .github └── workflows │ ├── deploy.yaml │ └── testing.yaml ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.rst ├── VERSION ├── docs ├── Makefile ├── _static │ └── css │ │ └── custom.css ├── conf.py ├── index.rst ├── make.bat ├── mxrecords.rst ├── normalize.rst ├── normalizer.rst ├── requirements.txt └── result.rst ├── email_normalize ├── __init__.py ├── providers.py └── py.typed ├── setup.cfg ├── setup.py └── tests ├── __init__.py ├── test_normalize.py └── test_normalizer.py /.codeclimate.yml: -------------------------------------------------------------------------------- 1 | languages: 2 | Python: true 3 | exclude_paths: 4 | - tests.py 5 | - setup.py 6 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | end_of_line = lf 5 | insert_final_newline = true 6 | trim_trailing_whitespace = true 7 | 8 | [*.py] 9 | indent_style = space 10 | indent_size = 4 11 | 12 | [.travis.yml] 13 | indent_style = space 14 | indent_size = 2 15 | -------------------------------------------------------------------------------- /.github/workflows/deploy.yaml: -------------------------------------------------------------------------------- 1 | name: Deployment 2 | on: 3 | push: 4 | branches-ignore: ["*"] 5 | tags: ["*"] 6 | jobs: 7 | deploy: 8 | runs-on: ubuntu-latest 9 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') && github.repository == 'gmr/email-normalize' 10 | container: python:3.8-alpine 11 | steps: 12 | - name: Checkout repository 13 | uses: actions/checkout@v1 14 | - name: Build package 15 | run: python3 setup.py sdist 16 | - name: Publish package 17 | uses: pypa/gh-action-pypi-publish@master 18 | with: 19 | user: __token__ 20 | password: ${{ secrets.PYPI_PASSWORD }} 21 | -------------------------------------------------------------------------------- /.github/workflows/testing.yaml: -------------------------------------------------------------------------------- 1 | name: Testing 2 | on: 3 | push: 4 | branches: ["*"] 5 | paths-ignore: 6 | - 'docs/**' 7 | - 'setup.*' 8 | - '*.md' 9 | - '*.rst' 10 | tags-ignore: ["*"] 11 | jobs: 12 | test: 13 | runs-on: ubuntu-latest 14 | timeout-minutes: 3 15 | strategy: 16 | matrix: 17 | python: [3.7, 3.8, 3.9] 18 | container: 19 | image: python:${{ matrix.python }}-alpine 20 | steps: 21 | - name: Checkout repository 22 | uses: actions/checkout@v1 23 | 24 | - name: Install OS dependencies 25 | run: apk --update add gcc libffi-dev linux-headers make musl-dev 26 | 27 | - name: Install testing dependencies 28 | run: pip3 --no-cache-dir install -e '.[testing]' 29 | 30 | - name: Create build directory 31 | run: mkdir build 32 | 33 | - name: Run flake8 tests 34 | run: flake8 35 | 36 | - name: Run tests 37 | run: coverage run 38 | 39 | - name: Output coverage 40 | run: coverage report && coverage xml 41 | 42 | - name: Upload Coverage 43 | uses: codecov/codecov-action@v1.0.2 44 | if: github.event_name == 'push' && github.repository == 'gmr/email-normalize' 45 | with: 46 | token: ${{secrets.CODECOV_TOKEN}} 47 | file: build/coverage.xml 48 | flags: unittests 49 | fail_ci_if_error: true 50 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | env2/ 12 | env3/ 13 | bin/ 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # Installer logs 28 | pip-log.txt 29 | pip-delete-this-directory.txt 30 | 31 | # Unit test / coverage reports 32 | htmlcov/ 33 | .tox/ 34 | .coverage 35 | .cache 36 | nosetests.xml 37 | coverage.xml 38 | 39 | # Translations 40 | *.mo 41 | 42 | # Mr Developer 43 | .mr.developer.cfg 44 | .project 45 | .pydevproject 46 | 47 | # Rope 48 | .ropeproject 49 | 50 | # Django stuff: 51 | *.log 52 | *.pot 53 | 54 | # Sphinx documentation 55 | docs/_build/ 56 | 57 | .idea 58 | .vagrant 59 | 60 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | To get setup in the environment and run the tests, take the following steps: 4 | 5 | ```bash 6 | python3 -m venv env 7 | source env/bin/activate 8 | pip install -e '.[test]' 9 | 10 | flake8 11 | coverage run && coverage report 12 | ``` 13 | 14 | ## Adding a Mailbox Provider 15 | 16 | If you know the features for a mailbox provider, simply modify 17 | `email_normalize.providers` adding a new class for the provider. 18 | Set the flags in the new class appropriately and add tests. 19 | 20 | ## Test Coverage 21 | 22 | Pull requests that make changes or additions that are not covered by tests 23 | will likely be closed without review. 24 | 25 | In addition, all tests must pass the tests **AND** flake8 linter. If flake8 26 | exceptions are included, the reasoning for adding the exception must be included 27 | in the pull request. 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015-2020 Gavin M. Roy 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | * Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | * Neither the name of the copyright holder nor the names of its contributors may 13 | be used to endorse or promote products derived from this software without 14 | specific prior written permission. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 19 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 20 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 21 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 22 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 23 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 24 | OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 25 | ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include CONTRIBUTING.md 2 | include LICENSE 3 | include README.rst 4 | include VERSION 5 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | email-normalize 2 | =============== 3 | ``email-normalize`` is a Python 3 library for returning a normalized email-address 4 | stripping mailbox provider specific behaviors such as "Plus addressing" 5 | (foo+bar@gmail.com). 6 | 7 | |Version| |Status| |Coverage| |License| 8 | 9 | Example 10 | ------- 11 | 12 | .. code:: python 13 | 14 | import email_normalize 15 | 16 | # Returns ``foo@gmail.com`` 17 | normalized = email_normalize.normalize('f.o.o+bar@gmail.com') 18 | 19 | Currently Supported Mailbox Providers 20 | ------------------------------------- 21 | - Apple 22 | - Fastmail 23 | - Google 24 | - Microsoft 25 | - ProtonMail 26 | - Rackspace 27 | - Yahoo 28 | - Yandex 29 | - Zoho 30 | 31 | Documentation 32 | ------------- 33 | http://email-normalize.readthedocs.org 34 | 35 | Python Versions Supported 36 | ------------------------- 37 | 3.7+ 38 | 39 | .. |Version| image:: https://img.shields.io/pypi/v/email-normalize.svg? 40 | :target: https://pypi.python.org/pypi/email-normalize 41 | 42 | .. |Status| image:: https://github.com/gmr/email-normalize/workflows/Testing/badge.svg? 43 | :target: https://github.com/gmr/email-normalize/actions?workflow=Testing 44 | :alt: Build Status 45 | 46 | .. |Coverage| image:: https://img.shields.io/codecov/c/github/gmr/email-normalize.svg? 47 | :target: https://codecov.io/github/gmr/email-normalize?branch=master 48 | 49 | .. |License| image:: https://img.shields.io/pypi/l/email-normalize.svg? 50 | :target: https://email-normalize.readthedocs.org 51 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 2.0.0 2 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/_static/css/custom.css: -------------------------------------------------------------------------------- 1 | .md-logo { 2 | } 3 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import pkg_resources 4 | import sphinx_material 5 | 6 | html_theme = 'sphinx_material' 7 | html_theme_path = sphinx_material.html_theme_path() 8 | html_context = sphinx_material.get_html_context() 9 | html_sidebars = { 10 | "**": ["globaltoc.html", "searchbox.html"] 11 | } 12 | html_theme_options = { 13 | 'base_url': 'http://email-normalize.readthedocs.io', 14 | 'repo_url': 'https://github.com/gmr/email-normalize/', 15 | 'repo_name': 'email-normalize', 16 | 'html_minify': True, 17 | 'css_minify': True, 18 | 'nav_title': 'email-normalize', 19 | 'globaltoc_depth': 2, 20 | 'theme_color': '0000aa', 21 | 'logo_icon': '⊆', 22 | 'color_primary': 'grey', 23 | 'color_accent': 'blue', 24 | 'version_dropdown': False 25 | } 26 | html_static_path = ['_static'] 27 | html_css_files = [ 28 | 'css/custom.css' 29 | ] 30 | 31 | master_doc = 'index' 32 | project = 'email-normalize' 33 | release = version = pkg_resources.get_distribution(project).version 34 | copyright = '2015-{}, Gavin M. Roy'.format(datetime.date.today().year) 35 | 36 | extensions = [ 37 | 'sphinx.ext.autodoc', 38 | 'sphinx_autodoc_typehints', 39 | 'sphinx.ext.intersphinx', 40 | 'sphinx.ext.viewcode', 41 | 'sphinx_material' 42 | ] 43 | 44 | set_type_checking_flag = True 45 | typehints_fully_qualified = True 46 | always_document_param_types = True 47 | typehints_document_rtype = True 48 | 49 | templates_path = ['_templates'] 50 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 51 | intersphinx_mapping = {'python': ('https://docs.python.org/3', None)} 52 | 53 | autodoc_default_options = {'autodoc_typehints': 'description'} 54 | 55 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | email-normalize 2 | =============== 3 | ``email-normalize`` is a Python 3 library for returning a normalized email-address 4 | stripping mailbox provider specific behaviors such as "Plus addressing" 5 | (foo+bar@gmail.com). 6 | 7 | The email-normalize API has two primary components: a single function, 8 | :func:`email_normalize.normalize` and the :class:`email_normalize.Normalizer` 9 | class. Both use Python's :py:mod:`asyncio` library. 10 | 11 | The :func:`~email_normalize.normalize` function is intended for 12 | use in non-async applications and the :class:`~email_normalize.Normalizer` is 13 | intended for async applications. :func:`~email_normalize.normalize` uses 14 | :class:`~email_normalize.Normalizer` under the hood. 15 | 16 | Documentation 17 | ------------- 18 | .. toctree:: 19 | :maxdepth: 1 20 | 21 | normalize 22 | normalizer 23 | mxrecords 24 | result 25 | 26 | Currently Supported Mailbox Providers 27 | ------------------------------------- 28 | - Apple 29 | - Fastmail 30 | - Google 31 | - Microsoft 32 | - ProtonMail 33 | - Rackspace 34 | - Yahoo 35 | - Yandex 36 | - Zoho 37 | 38 | Installation 39 | ------------ 40 | email-normalize is available via the `Python Package Index `_. 41 | 42 | .. code:: 43 | 44 | pip3 install email-normalize 45 | 46 | Indices and tables 47 | ================== 48 | 49 | * :ref:`genindex` 50 | * :ref:`modindex` 51 | * :ref:`search` 52 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/mxrecords.rst: -------------------------------------------------------------------------------- 1 | MXRecords Type 2 | ============== 3 | 4 | .. data:: email_normalize.MXRecords 5 | 6 | A typing alias for list of tuples containing the priority and host name for 7 | each record returned during the MX lookup. 8 | 9 | .. code-block:: python 10 | 11 | typing.List[typing.Tuple[int, str]] 12 | 13 | **Example** 14 | 15 | .. code-block:: python 16 | 17 | [ 18 | (5, 'gmail-smtp-in.l.google.com'), 19 | (10, 'alt1.gmail-smtp-in.l.google.com'), 20 | (20, 'alt2.gmail-smtp-in.l.google.com'), 21 | (30, 'alt3.gmail-smtp-in.l.google.com'), 22 | (40, 'alt4.gmail-smtp-in.l.google.com') 23 | ] 24 | -------------------------------------------------------------------------------- /docs/normalize.rst: -------------------------------------------------------------------------------- 1 | normalize Function 2 | ================== 3 | 4 | .. autofunction:: email_normalize.normalize 5 | -------------------------------------------------------------------------------- /docs/normalizer.rst: -------------------------------------------------------------------------------- 1 | Normalizer Class 2 | ================ 3 | 4 | .. autoclass:: email_normalize.Normalizer 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | Sphinx 2 | sphinx-autodoc-typehints 3 | sphinx-material==0.0.30 4 | typed_ast 5 | -------------------------------------------------------------------------------- /docs/result.rst: -------------------------------------------------------------------------------- 1 | Result Class 2 | ============ 3 | 4 | .. autoclass:: email_normalize.Result 5 | :members: 6 | -------------------------------------------------------------------------------- /email_normalize/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | email-normalize 3 | =============== 4 | 5 | Library for returning a normalized email-address stripping mailbox provider 6 | specific behaviors such as "Plus addressing" (foo+bar@gmail.com). 7 | 8 | """ 9 | import asyncio 10 | import copy 11 | import dataclasses 12 | import logging 13 | import operator 14 | import time 15 | import typing 16 | from email import utils 17 | 18 | import aiodns 19 | from aiodns import error 20 | 21 | from email_normalize import providers 22 | 23 | LOGGER = logging.getLogger(__name__) 24 | 25 | MXRecords = typing.List[typing.Tuple[int, str]] 26 | 27 | cache: typing.Dict[str, 'CachedItem'] = {} 28 | 29 | 30 | class CachedItem: 31 | """Used to represent a cached lookup for implementing a LFRU cache""" 32 | __slots__ = ['cached_at', 'hits', 'last_access', 'mx_records', 'ttl'] 33 | 34 | def __init__(self, mx_records: MXRecords, ttl: int): 35 | self.cached_at = time.monotonic() 36 | self.hits = 0 37 | self.last_access: float = 0.0 38 | self.mx_records = mx_records 39 | self.ttl = ttl 40 | 41 | @property 42 | def expired(self): 43 | return (time.monotonic() - self.cached_at) > self.ttl 44 | 45 | 46 | @dataclasses.dataclass(frozen=True) 47 | class Result: 48 | """Instances of the :class:`~email_normalize.Result` class contain data 49 | from the email normalization process. 50 | 51 | :param address: The address that was normalized 52 | :type address: str 53 | :param normalized_address: The normalized version of the address 54 | :type normalized_address: str 55 | :param mx_records: A list of tuples representing the priority and host of 56 | the MX records found for the email address. If empty, indicates a 57 | failure to lookup the domain part of the email address. 58 | :type mx_records: :data:`~email_normalize.MXRecords` 59 | :param mailbox_provider: String that represents the mailbox provider name 60 | - is `None` if the mailbox provider could not be detected or 61 | was unsupported. 62 | :type mailbox_provider: str 63 | 64 | .. note:: If during the normalization process the MX records could not be 65 | resolved, the ``mx_records`` attribute will be an empty :class:`list` 66 | and the ``mailbox_provider`` attribute will be :data:`None`. 67 | 68 | **Example** 69 | 70 | .. code-block:: python 71 | 72 | @dataclasses.dataclass(frozen=True) 73 | class Result: 74 | address = 'Gavin.M.Roy+ignore-spam@gmail.com' 75 | normalized_address = 'gavinmroy@gmail.com' 76 | mx_records = [ 77 | (5, 'gmail-smtp-in.l.google.com'), 78 | (10, 'alt1.gmail-smtp-in.l.google.com'), 79 | (20, 'alt2.gmail-smtp-in.l.google.com'), 80 | (30, 'alt3.gmail-smtp-in.l.google.com'), 81 | (40, 'alt4.gmail-smtp-in.l.google.com') 82 | ] 83 | mailbox_provider = 'Gmail' 84 | 85 | """ 86 | address: str 87 | normalized_address: str 88 | mx_records: MXRecords 89 | mailbox_provider: typing.Optional[str] = None 90 | 91 | 92 | class Normalizer: 93 | """Class for normalizing an email address and resolving MX records. 94 | 95 | Normalization is processed by splitting the local and domain parts of the 96 | email address and then performing DNS resolution for the MX records 97 | associated with the domain part of the address. The MX records are 98 | processed against a set of mailbox provider specific rules. If a match 99 | is found for the MX record hosts, the rules are applied to the email 100 | address. 101 | 102 | This class implements a least frequent recently used cache that respects 103 | the DNS TTL returned when performing MX lookups. Data is cached at the 104 | **module** level. 105 | 106 | **Usage Example** 107 | 108 | .. code-block:: python 109 | 110 | async def normalize(email_address: str) -> email_normalize.Result: 111 | normalizer = email_normalize.Normalizer() 112 | return await normalizer.normalize('foo@bar.io') 113 | 114 | :param name_servers: Optional list of hostnames to use for DNS resolution 115 | :type name_servers: list(str) or None 116 | :param int cache_limit: The maximum number of domain results that are 117 | cached. Defaults to `1024`. 118 | 119 | :param bool cache_failures: Toggle the behavior of caching DNS resolution 120 | failures for a given domain. When enabled, failures will be cached 121 | for `failure_ttl` seconds. Defaults to `True`. 122 | :param int failure_ttl: Duration in seconds to cache DNS failures. Only 123 | works when `cache_failures` is set to `True`. Defaults to `300` 124 | seconds. 125 | 126 | """ 127 | 128 | def __init__(self, 129 | name_servers: typing.Optional[typing.List[str]] = None, 130 | cache_limit: int = 1024, 131 | cache_failures: bool = True, 132 | failure_ttl: int = 300) -> 'Normalizer': 133 | self._resolver = aiodns.DNSResolver(name_servers) 134 | self.cache_failures = cache_failures 135 | self.cache_limit = cache_limit 136 | self.failure_ttl = failure_ttl 137 | 138 | async def mx_records(self, domain_part: str) -> MXRecords: 139 | """Resolve MX records for a domain returning a list of tuples with the 140 | MX priority and value. 141 | 142 | :param domain_part: The domain to resolve MX records for 143 | :type domain_part: str 144 | :rtype: :data:`~email_normalize.MXRecords` 145 | 146 | """ 147 | if self._skip_cache(domain_part): 148 | try: 149 | records = await self._resolver.query(domain_part, 'MX') 150 | except error.DNSError as err: 151 | LOGGER.debug('Failed to resolve %r: %s', domain_part, err) 152 | if not self.cache_failures: 153 | return [] 154 | mx_records, ttl = [], self.failure_ttl 155 | else: 156 | mx_records = [(r.priority, r.host) for r in records] 157 | ttl = min(r.ttl for r in records) \ 158 | if records else self.failure_ttl 159 | 160 | # Prune the cache if over the limit, finding least used, oldest 161 | if len(cache.keys()) >= self.cache_limit: 162 | key_to_prune = sorted( 163 | cache.items(), key=lambda i: ( 164 | i[1].hits, i[1].last_access))[0][0] 165 | LOGGER.debug('Pruning cache of %s', key_to_prune) 166 | del cache[key_to_prune] 167 | 168 | cache[domain_part] = CachedItem( 169 | sorted(mx_records, key=operator.itemgetter(0, 1)), ttl) 170 | 171 | cache[domain_part].hits += 1 172 | cache[domain_part].last_access = time.monotonic() 173 | return copy.deepcopy(cache[domain_part].mx_records) 174 | 175 | async def normalize(self, email_address: str) -> Result: 176 | """Return a :class:`~email_normalize.Result` instance containing the 177 | original address, the normalized address, the MX records found, and 178 | the detected mailbox provider. 179 | 180 | .. note:: If the MX records could not be resolved, the ``mx_records`` 181 | attribute of the result will be an empty :class:`list` and the 182 | ``mailbox_provider`` will be :data:`None`. 183 | 184 | :param email_address: The address to normalize 185 | :rtype: :class:`~email_normalize.Result` 186 | 187 | """ 188 | address = utils.parseaddr(email_address) 189 | local_part, domain_part = address[1].lower().split('@') 190 | mx_records = await self.mx_records(domain_part) 191 | provider = self._lookup_provider(mx_records) 192 | if provider: 193 | if provider.Flags & providers.Rules.LOCAL_PART_AS_HOSTNAME: 194 | local_part, domain_part = self._local_part_as_hostname( 195 | local_part, domain_part) 196 | if provider.Flags & providers.Rules.STRIP_PERIODS: 197 | local_part = local_part.replace('.', '') 198 | if provider.Flags & providers.Rules.PLUS_ADDRESSING: 199 | local_part = local_part.split('+')[0] 200 | if provider.Flags & providers.Rules.DASH_ADDRESSING: 201 | local_part = local_part.split('-')[0] 202 | return Result(email_address, '@'.join([local_part, domain_part]), 203 | mx_records, provider.__name__ if provider else None) 204 | 205 | @staticmethod 206 | def _local_part_as_hostname(local_part: str, 207 | domain_part: str) -> typing.Tuple[str, str]: 208 | domain_segments = domain_part.split('.') 209 | if len(domain_segments) > 2: 210 | local_part = domain_segments[0] 211 | domain_part = '.'.join(domain_segments[1:]) 212 | return local_part, domain_part 213 | 214 | @staticmethod 215 | def _lookup_provider(mx_records: typing.List[typing.Tuple[int, str]]) \ 216 | -> typing.Optional[providers.MailboxProvider]: 217 | for priority, host in mx_records: 218 | lchost = host.lower(); 219 | for provider in providers.Providers: 220 | for domain in provider.MXDomains: 221 | if lchost.endswith(domain): 222 | return provider 223 | 224 | def _skip_cache(self, domain: str) -> bool: 225 | if domain not in cache: 226 | return True 227 | elif cache[domain].expired: 228 | del cache[domain] 229 | return True 230 | return False 231 | 232 | 233 | def normalize(email_address: str) -> Result: 234 | """Normalize an email address 235 | 236 | This method abstracts the :mod:`asyncio` base for this library and 237 | provides a blocking function. If you intend to use this library as part of 238 | an :mod:`asyncio` based application, it is recommended that you use 239 | the :meth:`~email_normalize.Normalizer.normalize` instead. 240 | 241 | .. note:: If the MX records could not be resolved, the ``mx_records`` 242 | attribute of the result will be an empty :class:`list` and the 243 | ``mailbox_provider`` attribute will be :data:`None`. 244 | 245 | **Usage Example** 246 | 247 | .. code-block:: python 248 | 249 | import email_normalize 250 | 251 | result = email_normalize.normalize('foo@bar.io') 252 | 253 | :param email_address: The address to normalize 254 | 255 | """ 256 | loop = asyncio.get_event_loop() 257 | normalizer = Normalizer() 258 | return loop.run_until_complete(normalizer.normalize(email_address)) 259 | -------------------------------------------------------------------------------- /email_normalize/providers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Provider Specific Rules 3 | """ 4 | import enum 5 | import typing 6 | 7 | 8 | class Rules(enum.Flag): 9 | """Represents what features a mailbox provider supports in dynamic 10 | aliasing of email addresses. 11 | 12 | Used to determine how to normalize provider specific email addresses. 13 | 14 | """ 15 | DASH_ADDRESSING = enum.auto() 16 | PLUS_ADDRESSING = enum.auto() 17 | LOCAL_PART_AS_HOSTNAME = enum.auto() 18 | STRIP_PERIODS = enum.auto() 19 | 20 | 21 | class MailboxProvider: 22 | """Base class to define the contract for the mail providers""" 23 | Flags: Rules 24 | MXDomains: typing.Set[str] 25 | 26 | 27 | class Apple(MailboxProvider): 28 | Flags: Rules = Rules.PLUS_ADDRESSING 29 | MXDomains: typing.Set[str] = {'icloud.com'} 30 | 31 | 32 | class Fastmail(MailboxProvider): 33 | Flags: Rules = Rules.PLUS_ADDRESSING ^ Rules.LOCAL_PART_AS_HOSTNAME 34 | MXDomains: typing.Set[str] = {'messagingengine.com'} 35 | 36 | 37 | class Google(MailboxProvider): 38 | Flags: Rules = Rules.PLUS_ADDRESSING ^ Rules.STRIP_PERIODS 39 | MXDomains: typing.Set[str] = {'google.com', 'googlemail.com'} 40 | 41 | 42 | class Microsoft(MailboxProvider): 43 | Flags: Rules = Rules.PLUS_ADDRESSING 44 | MXDomains: typing.Set[str] = {'outlook.com'} 45 | 46 | 47 | class ProtonMail(MailboxProvider): 48 | Flags: Rules = Rules.PLUS_ADDRESSING 49 | MXDomains: typing.Set[str] = {'protonmail.ch'} 50 | 51 | 52 | class Rackspace(MailboxProvider): 53 | Flags: Rules = Rules.PLUS_ADDRESSING 54 | MXDomains: typing.Set[str] = {'emailsrvr.com'} 55 | 56 | 57 | class Yahoo(MailboxProvider): 58 | Flags: Rules = Rules.DASH_ADDRESSING ^ Rules.STRIP_PERIODS 59 | MXDomains: typing.Set[str] = {'yahoodns.net'} 60 | 61 | 62 | class Yandex(MailboxProvider): 63 | Flags: Rules = Rules.PLUS_ADDRESSING 64 | MXDomains: typing.Set[str] = {'mx.yandex.net', 'yandex.ru'} 65 | 66 | 67 | class Zoho(MailboxProvider): 68 | Flags: Rules = Rules.PLUS_ADDRESSING 69 | MXDomains: typing.Set[str] = {'zoho.com'} 70 | 71 | 72 | Providers = [ 73 | Apple, 74 | Fastmail, 75 | Google, 76 | Microsoft, 77 | ProtonMail, 78 | Rackspace, 79 | Yahoo, 80 | Yandex, 81 | Zoho 82 | ] 83 | -------------------------------------------------------------------------------- /email_normalize/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gmr/email-normalize/9a3a18a60ef8d1822255f8ee4a31ccba8fd03d5a/email_normalize/py.typed -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = email-normalize 3 | version = file: VERSION 4 | description = Return a normalized email-address stripping ISP specific behaviors 5 | long_description = file: README.rst 6 | long_description_content_type = text/x-rst; charset=UTF-8 7 | license = BSD 3-Clause License 8 | license-file = LICENSE 9 | home-page = https://github.com/gmr/email-normalize 10 | project_urls = 11 | Bug Tracker = https://github.com/gmr/email-normalize/issues 12 | Documentation = https://email-normalize.readthedocs.io 13 | Source Code = https://github.com/gmr/email-normalize/ 14 | author = Gavin M. Roy 15 | author_email = gavinmroy@gmail.com 16 | classifiers = 17 | Development Status :: 5 - Production/Stable 18 | Intended Audience :: Developers 19 | Framework :: AsyncIO 20 | License :: OSI Approved :: BSD License 21 | Natural Language :: English 22 | Operating System :: OS Independent 23 | Programming Language :: Python :: 3 24 | Programming Language :: Python :: 3.7 25 | Programming Language :: Python :: 3.8 26 | Programming Language :: Python :: 3.9 27 | Topic :: Communications 28 | Topic :: Communications :: Email 29 | Topic :: Internet 30 | Topic :: Software Development 31 | Typing :: Typed 32 | requires-dist = setuptools 33 | keywords = 34 | email 35 | 36 | [options] 37 | include_package_data = True 38 | install_requires = 39 | aiodns 40 | packages = 41 | email_normalize 42 | zip_safe = true 43 | 44 | [options.extras_require] 45 | testing = 46 | asynctest 47 | coverage 48 | flake8 49 | flake8-comprehensions 50 | flake8-deprecated 51 | flake8-import-order 52 | flake8-print 53 | flake8-quotes 54 | flake8-rst-docstrings 55 | flake8-tuple 56 | pygments 57 | 58 | [coverage:run] 59 | branch = True 60 | command_line = -m unittest discover tests --verbose 61 | data_file = build/.coverage 62 | 63 | [coverage:report] 64 | show_missing = True 65 | include = 66 | email_normalize/* 67 | omit = 68 | tests/*.py 69 | 70 | [coverage:html] 71 | directory = build/coverage 72 | 73 | [coverage:xml] 74 | output = build/coverage.xml 75 | 76 | [flake8] 77 | application-import-names = email_normalize, tests 78 | exclude = build,docs,env 79 | ignore = RST306,RST399,W503 80 | import-order-style = pycharm 81 | rst-directives = seealso 82 | rst-roles = attr,class,const,data,exc,func,meth,mod,obj,ref,yields 83 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import setuptools 3 | 4 | setuptools.setup() 5 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gmr/email-normalize/9a3a18a60ef8d1822255f8ee4a31ccba8fd03d5a/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_normalize.py: -------------------------------------------------------------------------------- 1 | import typing 2 | import unittest 3 | import uuid 4 | import warnings 5 | 6 | from asynctest import mock 7 | 8 | import email_normalize 9 | 10 | 11 | class TestCase(unittest.TestCase): 12 | 13 | @classmethod 14 | def setUpClass(cls) -> None: 15 | warnings.simplefilter('ignore') 16 | 17 | 18 | class InvalidDomainTestCase(TestCase): 19 | 20 | def test_invalid_domain_part(self): 21 | address = '{}@{}'.format(uuid.uuid4(), uuid.uuid4()) 22 | result = email_normalize.normalize(address) 23 | self.assertIsInstance(result, email_normalize.Result) 24 | self.assertEqual(result.address, address) 25 | self.assertEqual(result.normalized_address, address) 26 | self.assertListEqual(result.mx_records, []) 27 | self.assertIsNone(result.mailbox_provider) 28 | 29 | 30 | class MailboxProviderTestCase(TestCase): 31 | 32 | def _perform_test(self, 33 | address: str, 34 | normalized: str, 35 | mx_records: typing.List[typing.Tuple[int, str]], 36 | provider: typing.Optional[str]): 37 | with mock.patch('email_normalize.Normalizer.mx_records') as mxr: 38 | mxr.return_value = mx_records 39 | result = email_normalize.normalize(address) 40 | self.assertIsInstance(result, email_normalize.Result) 41 | self.assertEqual(result.address, address) 42 | self.assertEqual(result.normalized_address, normalized) 43 | self.assertListEqual(result.mx_records, mx_records) 44 | self.assertEqual(result.mailbox_provider, provider) 45 | 46 | def test_apple(self): 47 | local_part = str(uuid.uuid4()) 48 | domain_part = str(uuid.uuid4()) 49 | address = '{}+test@{}'.format(local_part, domain_part) 50 | mx_records = [(10, 'mx01.mail.icloud.com')] 51 | self._perform_test( 52 | address, '{}@{}'.format(local_part, domain_part), 53 | mx_records, 'Apple') 54 | 55 | def test_fastmail_plus_addressing(self): 56 | local_part = str(uuid.uuid4()) 57 | domain_part = str(uuid.uuid4()) 58 | address = '{}+test@{}'.format(local_part, domain_part) 59 | mx_records = [(10, 'in1-smtp.messagingengine.com')] 60 | self._perform_test( 61 | address, '{}@{}'.format(local_part, domain_part), 62 | mx_records, 'Fastmail') 63 | 64 | def test_fastmail_local_part_as_hostname(self): 65 | local_part = str(uuid.uuid4()) 66 | domain_part = '{}.com'.format(uuid.uuid4()) 67 | address = 'testing@{}.{}'.format(local_part, domain_part) 68 | mx_records = [(10, 'in1-smtp.messagingengine.com')] 69 | self._perform_test( 70 | address, '{}@{}'.format(local_part, domain_part), 71 | mx_records, 'Fastmail') 72 | 73 | def test_google(self): 74 | local_part = str(uuid.uuid4()).replace('-', '.') 75 | domain_part = str(uuid.uuid4()) 76 | address = '{}+test@{}'.format(local_part, domain_part) 77 | mx_records = [(1, 'aspmx.l.google.com')] 78 | self._perform_test( 79 | address, '{}@{}'.format(local_part.replace('.', ''), domain_part), 80 | mx_records, 'Google') 81 | 82 | def test_microsoft(self): 83 | local_part = str(uuid.uuid4()) 84 | domain_part = str(uuid.uuid4()) 85 | address = '{}+test@{}'.format(local_part, domain_part) 86 | mx_records = [(10, 'domain-com.mail.protection.outlook.com')] 87 | self._perform_test( 88 | address, '{}@{}'.format(local_part, domain_part), 89 | mx_records, 'Microsoft') 90 | 91 | def test_protonmail(self): 92 | local_part = str(uuid.uuid4()) 93 | domain_part = str(uuid.uuid4()) 94 | address = '{}+test@{}'.format(local_part, domain_part) 95 | mx_records = [(5, 'mail.protonmail.ch')] 96 | self._perform_test( 97 | address, '{}@{}'.format(local_part, domain_part), 98 | mx_records, 'ProtonMail') 99 | 100 | def test_rackspace(self): 101 | local_part = str(uuid.uuid4()) 102 | domain_part = str(uuid.uuid4()) 103 | address = '{}+test@{}'.format(local_part, domain_part) 104 | mx_records = [(10, 'mx1.emailsrvr.com')] 105 | self._perform_test( 106 | address, '{}@{}'.format(local_part, domain_part), 107 | mx_records, 'Rackspace') 108 | 109 | def test_yahoo(self): 110 | local_part = str(uuid.uuid4()) 111 | domain_part = str(uuid.uuid4()) 112 | address = '{}@{}'.format(local_part, domain_part) 113 | mx_records = [(1, 'mta5.am0.yahoodns.net')] 114 | self._perform_test( 115 | address, '{}@{}'.format(local_part.split('-', 1)[0], domain_part), 116 | mx_records, 'Yahoo') 117 | 118 | def test_yandex(self): 119 | local_part = str(uuid.uuid4()) 120 | domain_part = str(uuid.uuid4()) 121 | address = '{}+test@{}'.format(local_part, domain_part) 122 | mx_records = [(10, 'mx.yandex.net')] 123 | self._perform_test( 124 | address, '{}@{}'.format(local_part, domain_part), 125 | mx_records, 'Yandex') 126 | 127 | def test_zoho(self): 128 | local_part = str(uuid.uuid4()) 129 | domain_part = str(uuid.uuid4()) 130 | address = '{}+test@{}'.format(local_part, domain_part) 131 | mx_records = [(10, 'mx.zoho.com')] 132 | self._perform_test( 133 | address, '{}@{}'.format(local_part, domain_part), 134 | mx_records, 'Zoho') 135 | -------------------------------------------------------------------------------- /tests/test_normalizer.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import functools 3 | import logging 4 | import operator 5 | import os 6 | import time 7 | import unittest 8 | import uuid 9 | 10 | import aiodns 11 | from asynctest import mock 12 | 13 | import email_normalize 14 | 15 | LOGGER = logging.getLogger(__name__) 16 | 17 | 18 | def async_test(*func): 19 | if func: 20 | @functools.wraps(func[0]) 21 | def wrapper(*args, **kwargs): 22 | LOGGER.debug('Starting test with loop %r', args[0]) 23 | args[0].loop.run_until_complete(func[0](*args, **kwargs)) 24 | LOGGER.debug('Test completed') 25 | return wrapper 26 | 27 | 28 | class AsyncTestCase(unittest.TestCase): 29 | 30 | def setUp(self) -> None: 31 | self.loop = asyncio.new_event_loop() 32 | asyncio.set_event_loop(self.loop) 33 | self.loop.set_debug(True) 34 | self.timeout = int(os.environ.get('ASYNC_TIMEOUT', '5')) 35 | self.timeout_handle = self.loop.call_later( 36 | self.timeout, self.on_timeout) 37 | self.resolver = aiodns.DNSResolver(loop=self.loop) 38 | self.normalizer = email_normalize.Normalizer() 39 | self.normalizer._resolver = self.resolver 40 | email_normalize.cache = {} 41 | 42 | def tearDown(self): 43 | LOGGER.debug('In AsyncTestCase.tearDown') 44 | if not self.timeout_handle.cancelled(): 45 | self.timeout_handle.cancel() 46 | self.loop.run_until_complete(self.loop.shutdown_asyncgens()) 47 | if self.loop.is_running: 48 | self.loop.close() 49 | super().tearDown() 50 | 51 | def on_timeout(self): 52 | self.loop.stop() 53 | raise TimeoutError( 54 | 'Test duration exceeded {} seconds'.format(self.timeout)) 55 | 56 | 57 | class NormalizerTestCase(AsyncTestCase): 58 | 59 | def setUp(self) -> None: 60 | super().setUp() 61 | if 'gmail.com' in email_normalize.cache: 62 | del email_normalize.cache['gmail.com'] 63 | 64 | @async_test 65 | async def test_mx_records(self): 66 | result = await self.resolver.query('gmail.com', 'MX') 67 | expectation = [] 68 | for record in result: 69 | expectation.append((record.priority, record.host)) 70 | expectation.sort(key=operator.itemgetter(0, 1)) 71 | self.assertListEqual( 72 | await self.normalizer.mx_records('gmail.com'), 73 | expectation) 74 | 75 | @async_test 76 | async def test_cache(self): 77 | await self.normalizer.mx_records('gmail.com') 78 | await self.normalizer.mx_records('gmail.com') 79 | self.assertEqual(email_normalize.cache['gmail.com'].hits, 2) 80 | del email_normalize.cache['gmail.com'] 81 | self.assertNotIn('gmail.com', email_normalize.cache) 82 | with self.assertRaises(KeyError): 83 | self.assertIsNone(email_normalize.cache['foo']) 84 | 85 | @async_test 86 | async def test_cache_max_size(self): 87 | for offset in range(0, self.normalizer.cache_limit): 88 | key = 'key-{}'.format(offset) 89 | email_normalize.cache[key] = email_normalize.CachedItem([], 60) 90 | email_normalize.cache[key].hits = 3 91 | email_normalize.cache[key].last_access = time.monotonic() 92 | 93 | key1 = 'gmail.com' 94 | await self.normalizer.mx_records(key1) 95 | 96 | self.assertNotIn('key-0', email_normalize.cache) 97 | 98 | key2 = 'github.com' 99 | await self.normalizer.mx_records(key2) 100 | self.assertNotIn(key1, email_normalize.cache) 101 | self.assertIn(key2, email_normalize.cache) 102 | 103 | @async_test 104 | async def test_cache_expiration(self): 105 | await self.normalizer.mx_records('gmail.com') 106 | cached_at = email_normalize.cache['gmail.com'].cached_at 107 | email_normalize.cache['gmail.com'].ttl = 1 108 | await asyncio.sleep(1) 109 | self.assertTrue(email_normalize.cache['gmail.com'].expired) 110 | await self.normalizer.mx_records('gmail.com') 111 | self.assertGreater( 112 | email_normalize.cache['gmail.com'].cached_at, cached_at) 113 | 114 | @async_test 115 | async def test_empty_mx_list(self): 116 | with mock.patch.object(self.normalizer, 'mx_records') as mx_records: 117 | mx_records.return_value = [] 118 | result = await self.normalizer.normalize('foo@bar.com') 119 | self.assertEqual(result.normalized_address, 'foo@bar.com') 120 | self.assertIsNone(result.mailbox_provider) 121 | self.assertListEqual(result.mx_records, []) 122 | 123 | @async_test 124 | async def test_failure_cached(self): 125 | key = str(uuid.uuid4()) 126 | records = await self.normalizer.mx_records(key) 127 | self.assertListEqual(records, []) 128 | self.assertIn(key, email_normalize.cache.keys()) 129 | 130 | @async_test 131 | async def test_failure_not_cached(self): 132 | email_normalize.cache_failures = False 133 | key = str(uuid.uuid4()) 134 | records = await self.normalizer.mx_records(key) 135 | self.assertListEqual(records, []) 136 | email_normalize.cache_failures = True 137 | 138 | @async_test 139 | async def test_weird_mx_list(self): 140 | with mock.patch.object(self.normalizer, 'mx_records') as recs: 141 | recs.return_value = [ 142 | (1, str(uuid.uuid4())), 143 | (10, 'aspmx.l.google.com') 144 | ] 145 | result = await self.normalizer.normalize('f.o.o+bar@gmail.com') 146 | self.assertEqual(result.normalized_address, 'foo@gmail.com') 147 | self.assertEqual(result.mailbox_provider, 'Google') 148 | --------------------------------------------------------------------------------