├── tests ├── __init__.py ├── utils │ ├── __init__.py │ └── test_email_filter.py ├── browsers │ ├── __init__.py │ ├── test_requests_browser.py │ └── test_chrome_browser.py ├── models │ ├── __init__.py │ └── test_page_data.py ├── data_extractors │ ├── __init__.py │ ├── test_email_extractor.py │ └── test_linkedin_extractor.py └── link_filters │ ├── __init__.py │ ├── test_default_link_filter.py │ ├── test_link_filter_base.py │ └── test_contact_link_filter.py ├── extract_emails ├── console │ ├── __init__.py │ └── application.py ├── models │ ├── __init__.py │ └── page_data.py ├── data_savers │ ├── __init__.py │ ├── data_saver.py │ └── csv_saver.py ├── utils │ ├── __init__.py │ ├── email_filter.py │ └── _top_level_domains.py ├── workers │ ├── __init__.py │ └── default_worker.py ├── errors │ ├── __init__.py │ └── errors.py ├── __init__.py ├── browsers │ ├── __init__.py │ ├── page_source_getter.py │ ├── httpx_browser.py │ └── chromium_browser.py ├── data_extractors │ ├── __init__.py │ ├── data_extractor.py │ ├── linkedin_extractor.py │ └── email_extractor.py └── link_filters │ ├── __init__.py │ ├── default_link_filter.py │ ├── link_filter_base.py │ └── contact_link_filter.py ├── docs ├── code │ ├── utils.md │ ├── errors.md │ ├── models.md │ ├── workers.md │ ├── browsers.md │ ├── link_filters.md │ └── data_extractors.md ├── index.md ├── quick_start │ ├── logs.md │ ├── save_data.md │ └── intro.md └── changelogs │ └── v5.md ├── images └── email.png ├── .flake8 ├── pytest.ini ├── tox.ini ├── .cursor └── rules │ ├── tests.mdc │ ├── general.mdc │ └── uv.mdc ├── setup.cfg ├── Makefile ├── .pre-commit-config.yaml ├── LICENSE ├── CONTRIBUTING.md ├── .gitignore ├── mkdocs.yml ├── pyproject.toml ├── README.md ├── CHANGELOG.md └── poetry.lock /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/utils/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/browsers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/data_extractors/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/link_filters/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /extract_emails/console/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/code/utils.md: -------------------------------------------------------------------------------- 1 | # Utils 2 | 3 | ::: extract_emails.utils.email_filter -------------------------------------------------------------------------------- /docs/code/errors.md: -------------------------------------------------------------------------------- 1 | # Errors 2 | 3 | ::: extract_emails.errors.errors 4 | -------------------------------------------------------------------------------- /docs/code/models.md: -------------------------------------------------------------------------------- 1 | # Models 2 | 3 | ::: extract_emails.models.page_data 4 | -------------------------------------------------------------------------------- /docs/code/workers.md: -------------------------------------------------------------------------------- 1 | # 
Workers 2 | 3 | ::: extract_emails.workers.default_worker -------------------------------------------------------------------------------- /images/email.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dmitriiweb/extract-emails/HEAD/images/email.png -------------------------------------------------------------------------------- /extract_emails/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .page_data import PageData 2 | 3 | __all__ = ("PageData",) 4 | -------------------------------------------------------------------------------- /extract_emails/data_savers/__init__.py: -------------------------------------------------------------------------------- 1 | from .csv_saver import CsvSaver 2 | 3 | __all__ = ("CsvSaver",) 4 | -------------------------------------------------------------------------------- /extract_emails/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .email_filter import email_filter 2 | 3 | __all__ = ("email_filter",) 4 | -------------------------------------------------------------------------------- /extract_emails/workers/__init__.py: -------------------------------------------------------------------------------- 1 | from .default_worker import DefaultWorker 2 | 3 | __all__ = ("DefaultWorker",) 4 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 88 3 | select = C,E,F,W,B,B9 4 | ignore = E203, E501, W503 5 | exclude = __init__.py -------------------------------------------------------------------------------- /extract_emails/errors/__init__.py: -------------------------------------------------------------------------------- 1 | from .errors import BrowserImportError 2 | 3 | __all__ = 
("BrowserImportError",) 4 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | markers = 3 | slow: marks tests as slow (deselect with '-m "not slow"') 4 | asyncio_mode = auto -------------------------------------------------------------------------------- /extract_emails/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "6.0.1" 2 | from .workers import DefaultWorker 3 | 4 | __all__ = ("DefaultWorker",) 5 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | isolated_build=True 3 | envlist = py310,py311,py312,py313 4 | 5 | [testenv] 6 | deps = pytest 7 | commands = 8 | pytest tests 9 | -------------------------------------------------------------------------------- /extract_emails/errors/errors.py: -------------------------------------------------------------------------------- 1 | class BrowserImportError(Exception): 2 | """Error for cases when required libraries for browsers were not installed""" 3 | 4 | pass 5 | -------------------------------------------------------------------------------- /docs/code/browsers.md: -------------------------------------------------------------------------------- 1 | # Browsers 2 | 3 | ::: extract_emails.browsers.page_source_getter 4 | 5 | ::: extract_emails.browsers.chromium_browser 6 | 7 | ::: extract_emails.browsers.httpx_browser 8 | -------------------------------------------------------------------------------- /docs/code/link_filters.md: -------------------------------------------------------------------------------- 1 | # Link Filters 2 | 3 | ::: extract_emails.link_filters.link_filter_base 4 | ::: extract_emails.link_filters.default_link_filter 5 | ::: 
extract_emails.link_filters.contact_link_filter 6 | -------------------------------------------------------------------------------- /docs/code/data_extractors.md: -------------------------------------------------------------------------------- 1 | # Data Extractors 2 | 3 | ::: extract_emails.data_extractors.data_extractor 4 | ::: extract_emails.data_extractors.email_extractor 5 | ::: extract_emails.data_extractors.linkedin_extractor 6 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 |  2 | 3 | ### Index 4 | - [Quick Start](quick_start/intro.md#Intro) 5 | - [Code References](code/workers.md#Workers) 6 | -------------------------------------------------------------------------------- /extract_emails/browsers/__init__.py: -------------------------------------------------------------------------------- 1 | from .chromium_browser import ChromiumBrowser 2 | from .httpx_browser import HttpxBrowser 3 | from .page_source_getter import PageSourceGetter 4 | 5 | __all__ = ("PageSourceGetter", "ChromiumBrowser", "HttpxBrowser") 6 | -------------------------------------------------------------------------------- /extract_emails/data_extractors/__init__.py: -------------------------------------------------------------------------------- 1 | from .data_extractor import DataExtractor 2 | from .email_extractor import EmailExtractor 3 | from .linkedin_extractor import LinkedinExtractor 4 | 5 | __all__ = ("DataExtractor", "EmailExtractor", "LinkedinExtractor") 6 | -------------------------------------------------------------------------------- /extract_emails/link_filters/__init__.py: -------------------------------------------------------------------------------- 1 | from .contact_link_filter import ContactInfoLinkFilter 2 | from .default_link_filter import DefaultLinkFilter 3 | from .link_filter_base import LinkFilterBase 4 | 5 | __all__ = 
("LinkFilterBase", "DefaultLinkFilter", "ContactInfoLinkFilter") 6 | -------------------------------------------------------------------------------- /tests/utils/test_email_filter.py: -------------------------------------------------------------------------------- 1 | from extract_emails.utils import email_filter 2 | 3 | 4 | def test_email_filter(): 5 | test_emails = ["email@email.com", "email@email.com", "2@pic.png"] 6 | filtered_emails = email_filter(test_emails) 7 | 8 | assert len(filtered_emails) == 1 9 | assert "email@email.com" in filtered_emails 10 | -------------------------------------------------------------------------------- /extract_emails/data_savers/data_saver.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Iterable 3 | 4 | from extract_emails.models import PageData 5 | 6 | 7 | class DataSaver(ABC): 8 | def __init__(self, **kwargs): 9 | pass 10 | 11 | @abstractmethod 12 | def save(self, data: Iterable[PageData]): 13 | pass 14 | -------------------------------------------------------------------------------- /.cursor/rules/tests.mdc: -------------------------------------------------------------------------------- 1 | --- 2 | description: 3 | globs: tests/**.py 4 | alwaysApply: false 5 | --- 6 | # Tests Writing Rules 7 | 8 | - use pytest 9 | - each test - one function, don't write classes for tests 10 | - do not mark tests as @pytest.mark.asyncio, because pyproject.toml already sets up auto-discovery of async tests 11 | - run tests with: `pytest --cov=extract_emails -vv tests/` -------------------------------------------------------------------------------- /tests/data_extractors/test_email_extractor.py: -------------------------------------------------------------------------------- 1 | from extract_emails.data_extractors import EmailExtractor 2 | 3 | STRING = """ 4 | blah blah email@example.com blah blah 5 | blah blah "email2@example.com" blah blah 6 | blah blah 
"email2@example.com" blah blah 7 | """ 8 | 9 | 10 | def test_get_data(): 11 | email_extractor = EmailExtractor() 12 | emails = email_extractor.get_data(STRING) 13 | 14 | assert "email2@example.com" in emails 15 | assert len(emails) == 2 16 | -------------------------------------------------------------------------------- /docs/quick_start/logs.md: -------------------------------------------------------------------------------- 1 | # Logs 2 | 3 | There is [loguru](https://github.com/Delgan/loguru) library under the hood. 4 | 5 | ## Settings 6 | ```python 7 | import sys 8 | 9 | from loguru import logger 10 | 11 | logger.add( 12 | sys.stderr, 13 | format="{time} {level} {message}", 14 | filter="my_module", 15 | level="INFO", 16 | ) 17 | ``` 18 | 19 | ## Disable/Enable 20 | ```python 21 | from loguru import logger 22 | 23 | logger.disable('extract_emails') 24 | logger.enable('extract_emails') 25 | ``` 26 | -------------------------------------------------------------------------------- /tests/link_filters/test_default_link_filter.py: -------------------------------------------------------------------------------- 1 | from extract_emails.link_filters import DefaultLinkFilter 2 | 3 | 4 | def test_default_link_filter(): 5 | test_urls = [ 6 | "https://example.com/page1.html", 7 | "/page.html", 8 | "/page.html", 9 | "https://google.com", 10 | ] 11 | link_filter = DefaultLinkFilter("https://example.com/") 12 | filtered_urls = link_filter.filter(test_urls) 13 | 14 | assert "https://google.com" not in filtered_urls 15 | assert len(filtered_urls) == 2 16 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [wheel] 2 | universal = 1 3 | 4 | [flake8] 5 | max-complexity = 24 6 | max-line-length = 90 7 | exclude = 8 | .git, 9 | .tox, 10 | .venv, 11 | __pycache__, 12 | build, 13 | dist, 14 | docs, 15 | geopy.egg-info 16 | 17 | [isort] 18 | ; 
https://github.com/timothycrosley/isort#multi-line-output-modes 19 | combine_as_imports = True 20 | force_grid_wrap = 0 21 | include_trailing_comma = True 22 | known_first_party = test 23 | line_length = 88 24 | multi_line_output = 3 25 | not_skip = __init__.py 26 | -------------------------------------------------------------------------------- /extract_emails/data_extractors/data_extractor.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | 4 | class DataExtractor(ABC): 5 | """Base class for all data extractors""" 6 | 7 | @property 8 | @abstractmethod 9 | def name(self) -> str: 10 | """Name of the data extractor, e.g. email, linkedin""" 11 | 12 | @abstractmethod 13 | def get_data(self, page_source: str) -> set[str]: 14 | """Extract needed data from a string 15 | 16 | Args: 17 | page_source: webpage content 18 | 19 | Returns: 20 | Set of data, e.g. {'email@email.com', 'email2@email.com'} 21 | """ 22 | -------------------------------------------------------------------------------- /docs/changelogs/v5.md: -------------------------------------------------------------------------------- 1 | # V5 2 | ## 5.3.0 3 | ### Changed 4 | - Add custom save mode to csv data saver 5 | 6 | ## 5.2.0 7 | ### Added 8 | - CLI tool 9 | - csv data saver 10 | 11 | ## 5.1.3 12 | ### Changed 13 | - Update dependencies 14 | 15 | ## 5.1.2 16 | ### Added 17 | - Python 3.10 support 18 | - Add CHANGELOG.md 19 | 20 | ## 5.1.0 21 | ### Added 22 | - Add save_as_csv class method to `PageData` model 23 | - Add logs to DefaultWorker 24 | ### Changed 25 | - Check if needed libraries for browsers were installed. 
If not will show user-friendly error 26 | - Small improvements in the code 27 | 28 | ## 5.0.2 29 | ### Changed 30 | - Fix imports for factories and DefaultWorker 31 | -------------------------------------------------------------------------------- /tests/browsers/test_requests_browser.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from extract_emails.browsers import HttpxBrowser 4 | 5 | 6 | @pytest.fixture 7 | def browser(): 8 | browser = HttpxBrowser() 9 | browser.start() 10 | yield browser 11 | browser.stop() 12 | 13 | 14 | @pytest.mark.slow 15 | def test_get_page_source(browser): 16 | url = "https://en.wikipedia.org/wiki/Python_(programming_language)" 17 | page_source = browser.get_page_source(url) 18 | assert "Python (programming language)" in page_source 19 | 20 | 21 | @pytest.mark.slow 22 | def test_get_page_source_wrong_url(browser): 23 | url = "ttps://en.wikipedia.org/wiki/Python_(programming_language)" 24 | page_source = browser.get_page_source(url) 25 | assert page_source == "" 26 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | version := $(shell uv run python -c 'from extract_emails import __version__; print(__version__)') 2 | 3 | .PHONY: test 4 | test: 5 | uv run pytest --cov=extract_emails -vv -m "not slow" tests/ 6 | 7 | .PHONY: test-all 8 | test-all: 9 | uv run pytest --cov=extract_emails -vv tests/ 10 | 11 | .PHONY: format 12 | format: 13 | uv run ruff check extract_emails tests --select I --fix 14 | uv run ruff format extract_emails tests 15 | 16 | .PHONY: lint 17 | lint: 18 | uv run ruff check extract_emails 19 | uv run mypy extract_emails 20 | 21 | .PHONY: docs-serve 22 | docs-serve: 23 | uv run mkdocs serve 24 | 25 | .PHONY: docs-publish 26 | docs-publish: 27 | uv run mkdocs gh-deploy --force 28 | 29 | .PHONY: publish 30 | publish: 31 | uv build 32 | uv 
publish 33 | -------------------------------------------------------------------------------- /extract_emails/utils/email_filter.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable 2 | 3 | from ._top_level_domains import TOP_LEVEL_DOMAINS 4 | 5 | 6 | def email_filter(emails: Iterable[str]) -> set[str]: 7 | """Remove duplicated emails and strings that only look like emails (e.g. 2@pic.png) 8 | 9 | Examples: 10 | >>> from extract_emails.utils import email_filter 11 | >>> test_emails = ["email@email.com", "email@email.com", "2@pic.png"] 12 | >>> filtered_emails = email_filter(test_emails) 13 | >>> filtered_emails 14 | {'email@email.com'} 15 | 16 | Args: 17 | emails: Iterable of emails to filter 18 | 19 | Returns: 20 | Set of filtered emails 21 | """ 22 | return set( 23 | email for email in emails if "." + email.split(".")[-1] in TOP_LEVEL_DOMAINS 24 | ) 25 | -------------------------------------------------------------------------------- /tests/browsers/test_chrome_browser.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import pytest_asyncio 3 | 4 | from extract_emails.browsers import ChromiumBrowser 5 | 6 | @pytest_asyncio.fixture 7 | async def browser(): 8 | browser = ChromiumBrowser() 9 | await browser.astart() 10 | yield browser 11 | await browser.astop() 12 | 13 | @pytest.mark.slow 14 | async def test_get_page_source(browser): 15 | url = "https://en.wikipedia.org/wiki/Python_(programming_language)" 16 | page_source = await browser.aget_page_source(url) 17 | assert "Python (programming language)" in page_source 18 | 19 | @pytest.mark.slow 20 | async def test_get_page_source_wrong_url(browser): 21 | url = "ttps://en.wikipedia.org/wiki/Python_(programming_language)" 22 | page_source = await browser.aget_page_source(url) 23 | assert page_source == "" 24 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: 
-------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: 20.8b1 4 | hooks: 5 | - id: black 6 | 7 | - repo: https://gitlab.com/pycqa/flake8 8 | rev: 3.8.4 9 | hooks: 10 | - id: flake8 11 | 12 | - repo: https://github.com/timothycrosley/isort 13 | rev: 5.7.0 14 | hooks: 15 | - id: isort 16 | additional_dependencies: [toml] 17 | exclude: ^.*/?setup\.py$ 18 | 19 | - repo: https://github.com/pre-commit/pre-commit-hooks 20 | rev: v3.4.0 21 | hooks: 22 | - id: trailing-whitespace 23 | exclude: | 24 | (?x)( 25 | ^tests/.*/fixtures/.* 26 | | ^tests/console/commands/debug/test_resolve.py 27 | ) 28 | - id: end-of-file-fixer 29 | exclude: ^tests/.*/fixtures/.* 30 | - id: debug-statements 31 | -------------------------------------------------------------------------------- /tests/link_filters/test_link_filter_base.py: -------------------------------------------------------------------------------- 1 | from extract_emails.link_filters import LinkFilterBase 2 | 3 | HTML_EXAMPLE = """ 4 | 5 |
6 |