├── tests ├── __init__.py ├── test_added.py ├── test_conformance.py └── cases │ ├── added.yml │ ├── extract.yml │ └── validate.yml ├── twitter_text ├── regexp │ ├── __init__.py │ ├── invalid_chars_group.py │ ├── punct.py │ ├── valid_port_number.py │ ├── valid_punycode.py │ ├── cyrillic_letters_and_marks.py │ ├── spaces_group.py │ ├── directional_markers_group.py │ ├── valid_url_query_ending_chars.py │ ├── valid_url_query_chars.py │ ├── invalid_url_without_protocol_preceding_chars.py │ ├── latin_accent_chars.py │ ├── invalid_chars.py │ ├── valid_domain_chars.py │ ├── valid_domain_name.py │ ├── valid_subdomain.py │ ├── valid_url_preceding_chars.py │ ├── valid_tco_url.py │ ├── valid_general_url_path_chars.py │ ├── invalid_domain_chars.py │ ├── valid_ascii_domain.py │ ├── valid_domain.py │ ├── valid_url_balanced_parens.py │ ├── valid_url_path_ending_chars.py │ ├── valid_url_path.py │ ├── extract_url.py │ ├── valid_cctld.py │ ├── emoji.py │ └── valid_gtld.py ├── has_invalid_characters.py ├── __init__.py ├── regex_supplant.py ├── extract_emojis.py ├── get_character_weight.py ├── config.py ├── extract_urls.py └── parse_tweet.py ├── pytest.ini ├── .gitignore ├── tox.ini ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report_ja.md │ └── bug_report_en.md └── workflows │ ├── release.yml │ └── test.yml ├── .readthedocs.yml ├── doc ├── index.rst └── conf.py ├── CHANGELOG.md ├── Makefile ├── LICENSE ├── .devcontainer └── devcontainer.json ├── pyproject.toml ├── README.rst └── poetry.lock /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /twitter_text/regexp/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = tests 3 | -------------------------------------------------------------------------------- /twitter_text/regexp/invalid_chars_group.py: -------------------------------------------------------------------------------- 1 | invalid_chars_group = r'\uFFFE\uFEFF\uFFFF' 2 | -------------------------------------------------------------------------------- /twitter_text/regexp/punct.py: -------------------------------------------------------------------------------- 1 | punct = r"\!'#%&'\(\)*\+,\\\-\.\/:;<=>\?@\[\]\^_{|}~\$/" 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.idea/ 2 | /venv/ 3 | __pycache__/ 4 | /dist/ 5 | /build/ 6 | *.egg-info 7 | /.tox/ 8 | -------------------------------------------------------------------------------- /twitter_text/regexp/valid_port_number.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | valid_port_number = re.compile(r'[0-9]+') 4 | -------------------------------------------------------------------------------- /twitter_text/regexp/valid_punycode.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | valid_punycode = re.compile(r'(?:xn--[\-0-9a-z]+)') 4 | -------------------------------------------------------------------------------- /twitter_text/regexp/cyrillic_letters_and_marks.py: 
-------------------------------------------------------------------------------- 1 | import re 2 | 3 | cyrillic_letters_and_marks = re.compile(r'\u0400-\u04FF') 4 | -------------------------------------------------------------------------------- /twitter_text/regexp/spaces_group.py: -------------------------------------------------------------------------------- 1 | spaces_group = r'\x09-\x0D\x20\x85\xA0\u1680\u180E\u2000-\u200A\u2028\u2029\u202F\u205F\u3000' 2 | -------------------------------------------------------------------------------- /twitter_text/regexp/directional_markers_group.py: -------------------------------------------------------------------------------- 1 | directional_markers_group = r'\u202A-\u202E\u061C\u200E\u200F\u2066\u2067\u2068\u2069' 2 | -------------------------------------------------------------------------------- /twitter_text/regexp/valid_url_query_ending_chars.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | valid_url_query_ending_chars = re.compile(r'[a-z0-9\-_&=#/]', re.IGNORECASE) 4 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py37, py38, py39, py310, py311 3 | 4 | [testenv] 5 | commands = 6 | pytest 7 | deps = 8 | pytest 9 | PyYAML 10 | -------------------------------------------------------------------------------- /twitter_text/regexp/valid_url_query_chars.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | valid_url_query_chars = re.compile(r"[a-z0-9!?*'@();:&=+$/%#\[\]\-_.,~|]", re.IGNORECASE) 4 | -------------------------------------------------------------------------------- /twitter_text/regexp/invalid_url_without_protocol_preceding_chars.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | invalid_url_without_protocol_preceding_chars = re.compile(r'[-_./]$') 4 | -------------------------------------------------------------------------------- /twitter_text/has_invalid_characters.py: -------------------------------------------------------------------------------- 1 | from .regexp.invalid_chars import invalid_chars 2 | 3 | 4 | def has_invalid_characters(text: str) -> bool: 5 | return invalid_chars.search(text) is not None 6 | -------------------------------------------------------------------------------- /twitter_text/regexp/latin_accent_chars.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | latin_accent_chars = re.compile( 4 | r'\xC0-\xD6\xD8-\xF6\xF8-\xFF\u0100-\u024F\u0253\u0254\u0256\u0257\u0259\u025B\u0263\u0268\u026F\u0272\u0289\u028B\u02BB\u0300-\u036F\u1E00-\u1EFF' 5 | ) 6 | -------------------------------------------------------------------------------- /twitter_text/regexp/invalid_chars.py: -------------------------------------------------------------------------------- 1 | from .invalid_chars_group import invalid_chars_group 2 | from ..regex_supplant import regex_supplant 3 | 4 | invalid_chars = regex_supplant( 5 | r'[#{invalid_chars_group}]', 6 | {'invalid_chars_group': invalid_chars_group} 7 | ) 8 | -------------------------------------------------------------------------------- /twitter_text/regexp/valid_domain_chars.py: -------------------------------------------------------------------------------- 1 | from .invalid_domain_chars import invalid_domain_chars 2 | 
from ..regex_supplant import regex_supplant 3 | 4 | valid_domain_chars = regex_supplant( 5 | r'[^#{invalid_domain_chars}]', 6 | { 7 | 'invalid_domain_chars': invalid_domain_chars 8 | } 9 | ) 10 | -------------------------------------------------------------------------------- /twitter_text/regexp/valid_domain_name.py: -------------------------------------------------------------------------------- 1 | from .valid_domain_chars import valid_domain_chars 2 | from ..regex_supplant import regex_supplant 3 | 4 | valid_domain_name = regex_supplant( 5 | r'(?:(?:#{valid_domain_chars}(?:-|#{valid_domain_chars})*)?#{valid_domain_chars}\.)', 6 | { 7 | 'valid_domain_chars': valid_domain_chars 8 | } 9 | ) 10 | -------------------------------------------------------------------------------- /twitter_text/regexp/valid_subdomain.py: -------------------------------------------------------------------------------- 1 | from .valid_domain_chars import valid_domain_chars 2 | from ..regex_supplant import regex_supplant 3 | 4 | valid_subdomain = regex_supplant( 5 | r'(?:(?:#{valid_domain_chars}(?:[_-]|#{valid_domain_chars})*)?#{valid_domain_chars}\.)', 6 | { 7 | 'valid_domain_chars': valid_domain_chars 8 | } 9 | ) 10 | -------------------------------------------------------------------------------- /twitter_text/__init__.py: -------------------------------------------------------------------------------- 1 | from twitter_text.parse_tweet import parse_tweet, ParsedResult 2 | from twitter_text.extract_urls import extract_urls, extract_urls_with_indices 3 | from twitter_text.extract_emojis import extract_emojis_with_indices 4 | 5 | __all__ = [ 6 | 'ParsedResult', 7 | 'parse_tweet', 8 | 'extract_urls', 9 | 'extract_urls_with_indices', 10 | 'extract_emojis_with_indices', 11 | ] 12 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report_ja.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: バグ報告 3 | about: 日本語でのバグ報告 4 | title: "[BUG] " 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **バグの概要** 11 | 12 | **本来期待される動作** 13 | 14 | **バグを再現する手順** 15 | 1. '...' を開く 16 | 2. '...' をクリック 17 | 3. '....' のところまでスクロール 18 | 4. 
エラーが発生 19 | 20 | **環境:** 21 | - OS: [例) macOS Mojave version 10.14.6] 22 | - Python バージョン: [例) 3.7] 23 | - パッケージバージョン [例) twitter-text-python==1.0.2] 24 | 25 | **その他の情報** 26 | -------------------------------------------------------------------------------- /twitter_text/regexp/valid_url_preceding_chars.py: -------------------------------------------------------------------------------- 1 | from .directional_markers_group import directional_markers_group 2 | from .invalid_chars_group import invalid_chars_group 3 | from ..regex_supplant import regex_supplant 4 | 5 | valid_url_preceding_chars = regex_supplant( 6 | r'(?:[^A-Za-z0-9@@$###{invalid_chars_group}]|[#{directional_markers_group}]|^)', 7 | { 8 | 'invalid_chars_group': invalid_chars_group, 9 | 'directional_markers_group': directional_markers_group 10 | } 11 | ) 12 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | build: 9 | os: ubuntu-22.04 10 | tools: 11 | python: "3.11" 12 | jobs: 13 | post_create_environment: 14 | - pip install poetry 15 | post_install: 16 | - poetry install --with docs 17 | 18 | # Build documentation in the doc/ directory with Sphinx 19 | sphinx: 20 | configuration: doc/conf.py 21 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. twitter-text-python documentation master file, created by 2 | sphinx-quickstart on Fri Jul 26 22:54:53 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | 7 | .. toctree:: 8 | :maxdepth: 2 9 | :caption: Contents: 10 | 11 | 12 | twitter-text-python 13 | =================== 14 | 15 | .. include:: ../README.rst 16 | :start-line: 7 17 | 18 | 19 | API References 20 | ============== 21 | 22 | .. 
automodule:: twitter_text 23 | :members: 24 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*' 7 | 8 | jobs: 9 | release: 10 | name: Build and release to PyPI 11 | runs-on: ubuntu-latest 12 | permissions: 13 | id-token: write 14 | 15 | steps: 16 | - uses: actions/checkout@v3 17 | - uses: actions/setup-python@v4 18 | with: 19 | python-version: 3.x 20 | - uses: snok/install-poetry@v1 21 | - run: poetry install --no-root -v 22 | - run: poetry build 23 | - uses: pypa/gh-action-pypi-publish@release/v1 24 | -------------------------------------------------------------------------------- /twitter_text/regexp/valid_tco_url.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from .valid_url_query_chars import valid_url_query_chars 4 | from .valid_url_query_ending_chars import valid_url_query_ending_chars 5 | from ..regex_supplant import regex_supplant 6 | 7 | valid_tco_url = regex_supplant( 8 | r'^https?:\/\/t\.co\/([a-z0-9]+)(?:\?#{valid_url_query_chars}*#{valid_url_query_ending_chars})?', 9 | { 10 | 'valid_url_query_chars': valid_url_query_chars, 11 | 'valid_url_query_ending_chars': valid_url_query_ending_chars 12 | }, 13 | re.IGNORECASE 14 | ) 15 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | 9 | jobs: 10 | test: 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | python-version: ['3.7', '3.8', '3.9', '3.10', '3.11'] 15 | 16 | steps: 17 | - uses: actions/checkout@v3 18 | - uses: actions/setup-python@v4 19 | with: 20 | python-version: ${{ matrix.python-version }} 21 | - uses: snok/install-poetry@v1 22 | - run: poetry install --no-root -v 23 | - run: poetry run pytest 24 | -------------------------------------------------------------------------------- /twitter_text/regexp/valid_general_url_path_chars.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from .cyrillic_letters_and_marks import cyrillic_letters_and_marks 4 | from .latin_accent_chars import latin_accent_chars 5 | from ..regex_supplant import regex_supplant 6 | 7 | valid_general_url_path_chars = regex_supplant( 8 | re.compile( 9 | r"[a-z#{cyrillic_letters_and_marks}0-9!\*';:=\+,\.\$\/%#\[\]\-\u2013_~@\|&#{latin_accent_chars}]", 10 | re.IGNORECASE 11 | ), 12 | { 13 | 'cyrillic_letters_and_marks': cyrillic_letters_and_marks, 14 | 'latin_accent_chars': latin_accent_chars 15 | } 16 | ) 17 | -------------------------------------------------------------------------------- /twitter_text/regexp/invalid_domain_chars.py: -------------------------------------------------------------------------------- 1 | from .directional_markers_group import directional_markers_group 2 | from .invalid_chars_group import invalid_chars_group 3 | from .punct import punct 4 | from .spaces_group import spaces_group 5 | from ..regex_supplant import regex_supplant 6 | 7 | invalid_domain_chars = regex_supplant( 8 | r'#{punct}#{spaces_group}#{invalid_chars_group}#{directional_markers_group}', 9 | { 10 | 'punct': punct, 11 | 'spaces_group': spaces_group, 12 | 'invalid_chars_group': invalid_chars_group, 13 | 
'directional_markers_group': directional_markers_group 14 | } 15 | ) 16 | -------------------------------------------------------------------------------- /twitter_text/regex_supplant.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Dict, Union, Pattern, Match 3 | 4 | 5 | def regex_supplant(regex: Union[str, Pattern], dic: Dict[str, Union[str, Pattern]], flags=0) -> Pattern: 6 | def repl(match: Match) -> str: 7 | name = match.group(1) 8 | pattern = dic.get(name, '') 9 | return pattern if isinstance(pattern, str) else pattern.pattern 10 | 11 | regex_str = regex if isinstance(regex, str) else regex.pattern 12 | new_flags = flags if isinstance(regex, str) else regex.flags | flags 13 | assembled_pat = re.sub(r'#\{(\w+)\}', repl, regex_str) 14 | 15 | return re.compile(assembled_pat, new_flags) 16 | -------------------------------------------------------------------------------- /twitter_text/regexp/valid_ascii_domain.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from .latin_accent_chars import latin_accent_chars 4 | from .valid_cctld import valid_cctld 5 | from .valid_gtld import valid_gtld 6 | from .valid_punycode import valid_punycode 7 | from ..regex_supplant import regex_supplant 8 | 9 | valid_ascii_domain = regex_supplant( 10 | re.compile( 11 | r'(?:(?:[\-a-z0-9#{latin_accent_chars}]+)\.)+(?:#{valid_gtld}|#{valid_cctld}|#{valid_punycode})', 12 | re.IGNORECASE 13 | ), 14 | { 15 | 'latin_accent_chars': latin_accent_chars, 16 | 'valid_gtld': valid_gtld, 17 | 'valid_cctld': valid_cctld, 18 | 'valid_punycode': valid_punycode 19 | } 20 | ) 21 | -------------------------------------------------------------------------------- /twitter_text/regexp/valid_domain.py: -------------------------------------------------------------------------------- 1 | from .valid_cctld import valid_cctld 2 | from .valid_domain_name import valid_domain_name 3 | from .valid_gtld import valid_gtld 4 | from .valid_punycode import valid_punycode 5 | from .valid_subdomain import valid_subdomain 6 | from ..regex_supplant import regex_supplant 7 | 8 | valid_domain = regex_supplant( 9 | r'(?:#{valid_subdomain}*#{valid_domain_name}(?:#{valid_gtld}|#{valid_cctld}|#{valid_punycode}))', 10 | { 11 | 'valid_subdomain': valid_subdomain, 12 | 'valid_domain_name': valid_domain_name, 13 | 'valid_gtld': valid_gtld, 14 | 'valid_cctld': valid_cctld, 15 | 'valid_punycode': valid_punycode 16 | } 17 | ) 18 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | ## [3.0.0] - 2023-05-24 3 | - [change] Drop support of Python 3.6. 4 | - [change] Support Python 3.10 and 3.11. 5 | - [change] Remove dependency on the package `attrs`. 6 | 7 | ## [2.0.1] - 2023-05-24 8 | - [fix] Loosen the version requirement of the package `attrs`. 9 | 10 | ## [2.0.0] - 2021-03-29 11 | - [change] Drop support of Python 3.5. 12 | - [change] Support Python 3.8 and 3.9. 13 | 14 | ## [1.0.2] - 2020-05-25 15 | - [fix] Loosen the version requirement of the package `attrs`. 16 | 17 | ## [1.0.1] - 2020-05-24 18 | - [fix] Fix a bug where CRLF (`\r\n`) is counted as two characters. 19 | - [fix] Prevent `UnicodeDecodeError` in Windows environment while installing the package. 
20 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = doc 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /twitter_text/regexp/valid_url_balanced_parens.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from .valid_general_url_path_chars import valid_general_url_path_chars 4 | from ..regex_supplant import regex_supplant 5 | 6 | valid_url_balanced_parens = regex_supplant( 7 | '\\(' + 8 | '(?:' + 9 | '#{valid_general_url_path_chars}+' + 10 | '|' + 11 | # allow one nested level of balanced parentheses 12 | '(?:' + 13 | '#{valid_general_url_path_chars}*' + 14 | '\\(' + 15 | '#{valid_general_url_path_chars}+' + 16 | '\\)' + 17 | '#{valid_general_url_path_chars}*' + 18 | ')' + 19 | ')' + 20 | '\\)', 21 | { 22 | 'valid_general_url_path_chars': valid_general_url_path_chars 23 | }, 24 | re.IGNORECASE 25 | ) 26 | -------------------------------------------------------------------------------- /twitter_text/regexp/valid_url_path_ending_chars.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from .cyrillic_letters_and_marks import cyrillic_letters_and_marks 4 | from .latin_accent_chars import latin_accent_chars 5 | from .valid_url_balanced_parens import valid_url_balanced_parens 6 | from ..regex_supplant import regex_supplant 7 | 8 | valid_url_path_ending_chars = regex_supplant( 9 | re.compile( 10 | r'[\+\-a-z#{cyrillic_letters_and_marks}0-9=_#\/#{latin_accent_chars}]|(?:#{valid_url_balanced_parens})', 11 | re.IGNORECASE 12 | ), 13 | { 14 | 'cyrillic_letters_and_marks': cyrillic_letters_and_marks, 15 | 'latin_accent_chars': latin_accent_chars, 16 | 'valid_url_balanced_parens': valid_url_balanced_parens 17 | } 18 | ) 19 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report_en.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: "[BUG] " 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **Expected behavior** 14 | A clear and concise description of what you expected to happen. 15 | 16 | **To Reproduce** 17 | Steps to reproduce the behavior: 18 | 1. Go to '...' 19 | 2. Click on '....' 20 | 3. Scroll down to '....' 21 | 4. See error 22 | 23 | **Environment:** 24 | - OS: [e.g. macOS Mojave version 10.14.6] 25 | - Python version: [e.g. 3.7] 26 | - Package version [e.g. twitter-text-python==1.0.2] 27 | 28 | **Additional context** 29 | Add any other context about the problem here. 
30 | -------------------------------------------------------------------------------- /twitter_text/extract_emojis.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from .regexp.emoji import emoji 4 | 5 | 6 | def extract_emojis_with_indices(text: str) -> List[dict]: 7 | """ 8 | Extract emojis present in ``text`` along with their Unicode code point indices. 9 | 10 | >>> extract_emojis_with_indices('text 😷') 11 | {'emoji': '😷', 'indices': [5, 6]} 12 | 13 | >>> extract_emojis_with_indices('🙋🏽👨‍🎤') 14 | [{'emoji': '🙋🏽', 'indices': [0, 2]}, {'emoji': '👨\u200d🎤', 'indices': [2, 5]}] 15 | """ 16 | def generator(): 17 | for match in emoji.finditer(text): 18 | yield { 19 | 'emoji': match.group(0), 20 | 'indices': [match.start(), match.end()] 21 | } 22 | 23 | return list(generator()) 24 | -------------------------------------------------------------------------------- /twitter_text/regexp/valid_url_path.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from .valid_general_url_path_chars import valid_general_url_path_chars 4 | from .valid_url_balanced_parens import valid_url_balanced_parens 5 | from .valid_url_path_ending_chars import valid_url_path_ending_chars 6 | from ..regex_supplant import regex_supplant 7 | 8 | valid_url_path = regex_supplant( 9 | '(?:' + 10 | '(?:' + 11 | '#{valid_general_url_path_chars}*' + 12 | '(?:#{valid_url_balanced_parens}#{valid_general_url_path_chars}*)*' + 13 | '#{valid_url_path_ending_chars}' + 14 | ')|(?:@#{valid_general_url_path_chars}+/)' + 15 | ')', 16 | { 17 | 'valid_general_url_path_chars': valid_general_url_path_chars, 18 | 'valid_url_balanced_parens': valid_url_balanced_parens, 19 | 'valid_url_path_ending_chars': valid_url_path_ending_chars 20 | }, 21 | re.IGNORECASE 22 | ) 23 | -------------------------------------------------------------------------------- /twitter_text/get_character_weight.py: -------------------------------------------------------------------------------- 1 | def get_character_weight(char: str, options: dict) -> int: 2 | """ 3 | Return an integer weight corresponding to `char`. 4 | The weight is determined by the Unicode code point of `char` and ranges specified by `options`. 5 | 6 | >>> char = '日' 7 | >>> options = { 8 | ... 'default_weight': 200, 9 | ... 'ranges': [ 10 | ... { 'start': 0, 'end': 4351, 'weight': 100 }, 11 | ... { 'start': 8192, 'end': 8205, 'weight': 100 } 12 | ... 
] 13 | >>> get_character_weight(char, options) 14 | 200 15 | """ 16 | ranges = options['ranges'] 17 | char_code_point = ord(char[0]) 18 | match = [range['weight'] for range in ranges if range['start'] <= char_code_point <= range['end']] 19 | weight = match[0] if match != [] else options['default_weight'] 20 | 21 | return weight 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | Copyright (c) 2019 swen128 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 5 | 6 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 7 | 8 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 9 | -------------------------------------------------------------------------------- /tests/test_added.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | import pytest 4 | import yaml 5 | 6 | from twitter_text import parse_tweet, extract_urls_with_indices 7 | 8 | 9 | def read_yaml(path) -> dict: 10 | with open(path, mode='r', encoding='utf-8') as f: 11 | return yaml.safe_load(f) 12 | 13 | 14 | def get_table(test_cases: dict, group_name: str) -> Tuple[str, List[list]]: 15 | header = ",".join(test_cases[group_name][0].keys()) 16 | values = [list(case.values()) for case in test_cases[group_name]] 17 | return header, values 18 | 19 | 20 | def parametrize(test_cases: dict, group_name: str): 21 | return pytest.mark.parametrize(*get_table(test_cases, group_name)) 22 | 23 | 24 | added = read_yaml('tests/cases/added.yml')['tests'] 25 | 26 | 27 | @parametrize(added, 'ParseTweet') 28 | def test_added_parse_tweet(description: str, text: str, expected: dict): 29 | assert parse_tweet(text).asdict() == expected 30 | 31 | 32 | @parametrize(added, 'ExtractUrlsWithIndices') 33 | def test_added_extract_urls_with_indices(description: str, text: str, expected: dict): 34 | assert extract_urls_with_indices(text) == expected 35 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | // For format details, see https://aka.ms/devcontainer.json. For config options, see the 2 | // README at: https://github.com/devcontainers/templates/tree/main/src/python 3 | { 4 | "name": "Python 3", 5 | // Or use a Dockerfile or Docker Compose file. 
More info: https://containers.dev/guide/dockerfile 6 | "image": "mcr.microsoft.com/devcontainers/python:0-3.11-bullseye", 7 | "features": { 8 | "ghcr.io/devcontainers-contrib/features/poetry:2": { 9 | "version": "latest" 10 | }, 11 | "ghcr.io/devcontainers-contrib/features/tox:2": { 12 | "version": "latest" 13 | } 14 | }, 15 | 16 | // Features to add to the dev container. More info: https://containers.dev/features. 17 | // "features": {}, 18 | 19 | // Use 'forwardPorts' to make a list of ports inside the container available locally. 20 | // "forwardPorts": [], 21 | 22 | // Use 'postCreateCommand' to run commands after the container is created. 23 | "postCreateCommand": "poetry install --with docs" 24 | 25 | // Configure tool-specific properties. 26 | // "customizations": {}, 27 | 28 | // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. 29 | // "remoteUser": "root" 30 | } 31 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "twitter-text-parser" 3 | version = "3.0.0" 4 | description = "A library to parse or validate Twitter texts properly" 5 | authors = ["swen128 "] 6 | readme = "README.rst" 7 | repository = "https://github.com/swen128/twitter-text-python" 8 | packages = [ 9 | {include = "twitter_text"}, 10 | ] 11 | classifiers = [ 12 | "Programming Language :: Python :: 3.7", 13 | "Programming Language :: Python :: 3.8", 14 | "Programming Language :: Python :: 3.9", 15 | "Programming Language :: Python :: 3.10", 16 | "Programming Language :: Python :: 3.11", 17 | "License :: OSI Approved :: MIT License", 18 | "Operating System :: OS Independent", 19 | "Intended Audience :: Developers", 20 | "Topic :: Text Processing" 21 | ] 22 | 23 | [tool.poetry.dependencies] 24 | python = "^3.7" 25 | 26 | [tool.poetry.group.test.dependencies] 27 | pyyaml = "^6.0" 28 | pytest = "^7.3.1" 29 | 30 | [tool.poetry.group.docs] 31 | optional = true 32 | 33 | [tool.poetry.group.docs.dependencies] 34 | sphinx = {version = "^6.2.1", python = "^3.8"} 35 | docutils = "^0.18" 36 | sphinx-rtd-theme = "^1.2.1" 37 | 38 | [build-system] 39 | requires = ["poetry-core>=1.0.0"] 40 | build-backend = "poetry.core.masonry.api" 41 | -------------------------------------------------------------------------------- /twitter_text/regexp/extract_url.py: -------------------------------------------------------------------------------- 1 | import re 2 | from .valid_domain import valid_domain 3 | from .valid_url_query_chars import valid_url_query_chars 4 | from .valid_url_query_ending_chars import valid_url_query_ending_chars 5 | from .valid_port_number import valid_port_number 6 | from .valid_url_path import valid_url_path 7 | from .valid_url_preceding_chars import valid_url_preceding_chars 8 | from ..regex_supplant import regex_supplant 9 | 10 | extract_url = regex_supplant( 11 | '(' + # $1 total match 12 | '(#{valid_url_preceding_chars})' + # $2 Preceding character 13 | '(' + # $3 URL 14 | '(https?:\\/\\/)?' + # $4 Protocol (optional) 15 | '(#{valid_domain})' + # $5 Domain(s) 16 | '(?::(#{valid_port_number}))?' + # $6 Port number (optional) 17 | '(\\/#{valid_url_path}*)?' + # $7 URL Path 18 | '(\\?#{valid_url_query_chars}*#{valid_url_query_ending_chars})?' 
+ # $8 Query String 19 | ')' + 20 | ')', 21 | { 22 | 'valid_domain': valid_domain, 23 | 'valid_url_query_chars': valid_url_query_chars, 24 | 'valid_url_query_ending_chars': valid_url_query_ending_chars, 25 | 'valid_port_number': valid_port_number, 26 | 'valid_url_path': valid_url_path, 27 | 'valid_url_preceding_chars': valid_url_preceding_chars 28 | }, 29 | re.IGNORECASE 30 | ) 31 | -------------------------------------------------------------------------------- /twitter_text/regexp/valid_cctld.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | valid_cctld = re.compile( 4 | '(?:(?:' + 5 | '한국|香港|澳門|新加坡|台灣|台湾|中國|中国|გე|ລາວ|ไทย|ලංකා|ഭാരതം|ಭಾರತ|భారత్|சிங்கப்பூர்|இலங்கை|இந்தியா|ଭାରତ|' + 6 | 'ભારત|ਭਾਰਤ|ভাৰত|ভারত|বাংলা|भारोत|भारतम्|भारत|ڀارت|پاکستان|موريتانيا|مليسيا|مصر|قطر|فلسطين|عمان|' + 7 | 'عراق|سورية|سودان|تونس|بھارت|بارت|ایران|امارات|المغرب|السعودية|الجزائر|البحرين|الاردن|հայ|қаз|' + 8 | 'укр|срб|рф|мон|мкд|ею|бел|бг|ευ|ελ|zw|zm|za|yt|ye|ws|wf|vu|vn|vi|vg|ve|vc|va|uz|uy|us|um|uk|' + 9 | 'ug|ua|tz|tw|tv|tt|tr|tp|to|tn|tm|tl|tk|tj|th|tg|tf|td|tc|sz|sy|sx|sv|su|st|ss|sr|so|sn|sm|sl|' + 10 | 'sk|sj|si|sh|sg|se|sd|sc|sb|sa|rw|ru|rs|ro|re|qa|py|pw|pt|ps|pr|pn|pm|pl|pk|ph|pg|pf|pe|pa|om|' + 11 | 'nz|nu|nr|np|no|nl|ni|ng|nf|ne|nc|na|mz|my|mx|mw|mv|mu|mt|ms|mr|mq|mp|mo|mn|mm|ml|mk|mh|mg|mf|' + 12 | 'me|md|mc|ma|ly|lv|lu|lt|ls|lr|lk|li|lc|lb|la|kz|ky|kw|kr|kp|kn|km|ki|kh|kg|ke|jp|jo|jm|je|it|' + 13 | 'is|ir|iq|io|in|im|il|ie|id|hu|ht|hr|hn|hm|hk|gy|gw|gu|gt|gs|gr|gq|gp|gn|gm|gl|gi|gh|gg|gf|ge|' + 14 | 'gd|gb|ga|fr|fo|fm|fk|fj|fi|eu|et|es|er|eh|eg|ee|ec|dz|do|dm|dk|dj|de|cz|cy|cx|cw|cv|cu|cr|co|' + 15 | 'cn|cm|cl|ck|ci|ch|cg|cf|cd|cc|ca|bz|by|bw|bv|bt|bs|br|bq|bo|bn|bm|bl|bj|bi|bh|bg|bf|be|bd|bb|' + 16 | 'ba|az|ax|aw|au|at|as|ar|aq|ao|an|am|al|ai|ag|af|ae|ad|ac' + 17 | ')(?=[^0-9a-zA-Z@+-]|$))' 18 | ) 19 | -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # http://www.sphinx-doc.org/en/master/config 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | 16 | import sphinx_rtd_theme 17 | 18 | sys.path.insert(0, os.path.abspath('../')) 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = 'twitter-text-python' 23 | copyright = '2019, swen128' 24 | author = 'swen128' 25 | 26 | # -- General configuration --------------------------------------------------- 27 | 28 | # Add any Sphinx extension module names here, as strings. They can be 29 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 30 | # ones. 31 | extensions = [ 32 | 'sphinx.ext.autodoc' 33 | ] 34 | 35 | # Add any paths that contain templates here, relative to this directory. 36 | templates_path = ['_templates'] 37 | 38 | # List of patterns, relative to source directory, that match files and 39 | # directories to ignore when looking for source files. 
40 | # This pattern also affects html_static_path and html_extra_path. 41 | exclude_patterns = [] 42 | 43 | # -- Options for HTML output ------------------------------------------------- 44 | 45 | # The theme to use for HTML and HTML Help pages. See the documentation for 46 | # a list of builtin themes. 47 | # 48 | html_theme = 'sphinx_rtd_theme' 49 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 50 | 51 | # Add any paths that contain custom static files (such as style sheets) here, 52 | # relative to this directory. They are copied after the builtin static files, 53 | # so a file named "default.css" will overwrite the builtin "default.css". 54 | html_static_path = ['_static'] 55 | 56 | # As suggested in https://stackoverflow.com/a/56448499 57 | master_doc = 'index' 58 | -------------------------------------------------------------------------------- /tests/test_conformance.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | import pytest 4 | import yaml 5 | 6 | from twitter_text import parse_tweet, extract_urls, extract_urls_with_indices 7 | 8 | 9 | def read_yaml(path) -> dict: 10 | with open(path, mode='r', encoding='utf-8') as f: 11 | return yaml.safe_load(f) 12 | 13 | 14 | def get_table(test_cases: dict, group_name: str) -> Tuple[str, List[list]]: 15 | header = ",".join(test_cases[group_name][0].keys()) 16 | values = [list(case.values()) for case in test_cases[group_name]] 17 | return header, values 18 | 19 | 20 | def parametrize(test_cases: dict, group_name: str): 21 | return pytest.mark.parametrize(*get_table(test_cases, group_name)) 22 | 23 | 24 | extract = read_yaml('tests/cases/extract.yml')['tests'] 25 | tlds = read_yaml('tests/cases/tlds.yml')['tests'] 26 | validate = read_yaml('tests/cases/validate.yml')['tests'] 27 | 28 | 29 | @parametrize(extract, 'tco_urls_with_params') 30 | def test_extract_tco_urls_with_params(description: str, text: str, expected: List[str]): 31 | assert extract_urls(text) == expected 32 | 33 | 34 | @parametrize(extract, 'urls') 35 | def test_extract_urls(description: str, text: str, expected: List[str]): 36 | assert extract_urls(text) == expected 37 | 38 | 39 | @parametrize(tlds, 'country') 40 | def test_tlds_country(description: str, text: str, expected: List[str]): 41 | assert extract_urls(text) == expected 42 | 43 | 44 | @parametrize(extract, 'urls_with_indices') 45 | def test_extract_urls_with_indices(description: str, text: str, expected: dict): 46 | assert extract_urls_with_indices(text) == expected 47 | 48 | 49 | @parametrize(extract, 'urls_with_directional_markers') 50 | def test_extract_urls_with_directional_markers(description: str, text: str, expected: dict): 51 | assert extract_urls_with_indices(text) == expected 52 | 53 | 54 | @parametrize(validate, 'WeightedTweetsWithDiscountedEmojiCounterTest') 55 | def test_validate_weighted_tweets_with_discounted_emoji_counter_test(description: str, text: str, expected: dict): 56 | assert parse_tweet(text).asdict() == expected 57 | 58 | 59 | @parametrize(validate, 'UnicodeDirectionalMarkerCounterTest') 60 | def test_validate_unicode_directional_marker_counter_test(description: str, text: str, expected: dict): 61 | assert parse_tweet(text).asdict() == expected 62 | -------------------------------------------------------------------------------- /tests/cases/added.yml: -------------------------------------------------------------------------------- 1 | tests: 2 | ExtractUrlsWithIndices: 3 | - description: "t.co URL 
immediately followed by another t.co URL" 4 | text: "https://t.co/slug/https://t.co/slug" 5 | expected: 6 | - url: "https://t.co/slug" 7 | indices: [0, 17] 8 | ParseTweet: 9 | - description: "CRLF character" 10 | text: "a\r\nb" 11 | expected: 12 | weightedLength: 3 13 | valid: true 14 | permillage: 10 15 | displayRangeStart: 0 16 | displayRangeEnd: 3 17 | validRangeStart: 0 18 | validRangeEnd: 3 19 | - description: "A URL containing emojis" 20 | text: "https://😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷.jp" 21 | expected: 22 | weightedLength: 23 23 | valid: true 24 | permillage: 82 25 | displayRangeStart: 0 26 | displayRangeEnd: 62 27 | validRangeStart: 0 28 | validRangeEnd: 62 29 | - description: "Hangul syllables such as gag (which may be a single character, or a sequence of conjoining jamos)" 30 | text: "각각" 31 | expected: 32 | weightedLength: 4 33 | valid: true 34 | permillage: 14 35 | displayRangeStart: 0 36 | displayRangeEnd: 5 37 | validRangeStart: 0 38 | validRangeEnd: 5 39 | - description: "One grapheme cluster composed of two Unicode code points (in Normalized Form C)" 40 | text: "\u1E9B\u0323" 41 | expected: 42 | weightedLength: 3 43 | valid: true 44 | permillage: 10 45 | displayRangeStart: 0 46 | displayRangeEnd: 1 47 | validRangeStart: 0 48 | validRangeEnd: 1 49 | ExtendedGraphemeClusters: 50 | - description: "Tamil 'ni'" 51 | text: "நிநி" 52 | expected: 53 | weightedLength: 4 54 | valid: true 55 | permillage: 14 56 | displayRangeStart: 0 57 | displayRangeEnd: 3 58 | validRangeStart: 0 59 | validRangeEnd: 3 60 | - description: "Thai 'e'" 61 | text: "เเ" 62 | expected: 63 | weightedLength: 2 64 | valid: true 65 | permillage: 7 66 | displayRangeStart: 0 67 | displayRangeEnd: 1 68 | validRangeStart: 0 69 | validRangeEnd: 1 70 | - description: "Devanagari letter 'ssi'" 71 | text: "षिषि" 72 | expected: 73 | weightedLength: 4 74 | valid: true 75 | permillage: 14 76 | displayRangeStart: 0 77 | displayRangeEnd: 3 78 | validRangeStart: 0 79 | validRangeEnd: 3 80 | - description: "Thai 'kam'" 81 | text: "กำกำ" 82 | expected: 83 | weightedLength: 4 84 | valid: true 85 | permillage: 14 86 | displayRangeStart: 0 87 | displayRangeEnd: 3 88 | validRangeStart: 0 89 | validRangeEnd: 3 90 | -------------------------------------------------------------------------------- /twitter_text/config.py: -------------------------------------------------------------------------------- 1 | config = { 2 | "version1": { 3 | "version": 1, 4 | "max_weighted_tweet_length": 140, 5 | "scale": 1, 6 | "default_weight": 1, 7 | "transformed_url_length": 23, 8 | "ranges": [] 9 | }, 10 | "version2": { 11 | "version": 2, 12 | "max_weighted_tweet_length": 280, 13 | "scale": 100, 14 | "default_weight": 200, 15 | "transformed_url_length": 23, 16 | "ranges": [ 17 | { 18 | "start": 0, 19 | "end": 4351, 20 | "weight": 100 21 | }, 22 | { 23 | "start": 8192, 24 | "end": 8205, 25 | "weight": 100 26 | }, 27 | { 28 | "start": 8208, 29 | "end": 8223, 30 | "weight": 100 31 | }, 32 | { 33 | "start": 8242, 34 | "end": 8247, 35 | "weight": 100 36 | } 37 | ] 38 | }, 39 | "version3": { 40 | "version": 3, 41 | "max_weighted_tweet_length": 280, 42 | "scale": 100, 43 | "default_weight": 200, 44 | "emoji_parsing_enabled": True, 45 | "transformed_url_length": 23, 46 | "ranges": [ 47 | { 48 | "start": 0, 49 | "end": 4351, 50 | "weight": 100 51 | }, 52 | { 53 | "start": 8192, 54 | "end": 8205, 55 | "weight": 100 56 | }, 57 | { 58 | "start": 8208, 59 | "end": 8223, 60 | "weight": 100 61 | }, 62 | { 63 | "start": 8242, 64 | "end": 8247, 65 | "weight": 100 66 | } 
67 | ] 68 | }, 69 | "defaults": { 70 | "version": 3, 71 | "max_weighted_tweet_length": 280, 72 | "scale": 100, 73 | "default_weight": 200, 74 | "emoji_parsing_enabled": True, 75 | "transformed_url_length": 23, 76 | "ranges": [ 77 | { 78 | "start": 0, 79 | "end": 4351, 80 | "weight": 100 81 | }, 82 | { 83 | "start": 8192, 84 | "end": 8205, 85 | "weight": 100 86 | }, 87 | { 88 | "start": 8208, 89 | "end": 8223, 90 | "weight": 100 91 | }, 92 | { 93 | "start": 8242, 94 | "end": 8247, 95 | "weight": 100 96 | } 97 | ] 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | twitter-text-python 2 | =================== 3 | 4 | .. image:: https://readthedocs.org/projects/twitter-text-python/badge/?version=latest 5 | :target: https://twitter-text-python.readthedocs.io/en/latest/?badge=latest 6 | :alt: Documentation Status 7 | 8 | .. image:: https://github.com/swen128/twitter-text-python/actions/workflows/test.yml/badge.svg 9 | :target: https://github.com/swen128/twitter-text-python/actions/workflows/test.yml 10 | 11 | .. image:: https://github.com/swen128/twitter-text-python/actions/workflows/release.yml/badge.svg 12 | :target: https://github.com/swen128/twitter-text-python/actions/workflows/release.yml 13 | 14 | This is a Python port of the `twitter/twitter-text`_ libraries, fully compliant with the `official conformance test suite`_. 15 | 16 | 17 | Features 18 | ======== 19 | 20 | This library calculates length of a tweet message according to `the documentation from Twitter Developers`_, 21 | so that you can validate the tweet without calling the Web API at all. 22 | Although counting characters might seem an easy task, in actual fact it is very complicated, especially when the text contains CJK characters, URLs, or emojis. 23 | 24 | The original twitter-text libraries have *hit-highlighting* and *auto-linking* features as well, 25 | however they are not yet supported by this Python port. 26 | 27 | 28 | Usage 29 | ===== 30 | 31 | Installation 32 | ------------ 33 | 34 | .. code-block:: console 35 | 36 | $ pip install twitter-text-parser 37 | 38 | 39 | Examples 40 | -------- 41 | 42 | See `the API reference `_ for more details. 43 | 44 | .. code-block:: python 45 | 46 | from twitter_text import parse_tweet, extract_emojis_with_indices, extract_urls_with_indices 47 | 48 | text = 'english text 日本語 😷 https://example.com' 49 | 50 | assert parse_tweet(text).asdict() == { 51 | 'weightedLength': 46, 52 | 'valid': True, 53 | 'permillage': 164, 54 | 'validRangeStart': 0, 55 | 'validRangeEnd': 38, 56 | 'displayRangeStart': 0, 57 | 'displayRangeEnd': 38 58 | } 59 | 60 | assert extract_urls_with_indices(text) == [{ 61 | 'url': 'https://example.com', 62 | 'indices': [19, 38] 63 | }] 64 | 65 | assert extract_emojis_with_indices(text) == [{ 66 | 'emoji': '😷', 67 | 'indices': [17, 18] 68 | }] 69 | 70 | 71 | Related Links 72 | ============= 73 | 74 | - `twitter/twitter-text`_: The original, official twitter-text implementations for Java, Ruby, JavaScript and Objective-C 75 | - `twitter-text Parser -- Twitter Developers`_: A brief overview of the twitter-text libraries 76 | - `Counting characters -- Twitter Developers`_: An introduction to how to count characters in Twitter texts 77 | - `edmondburnett/twitter-text-python`_: Another python port of twitter-text, which is not compliant with the `official conformance test suite`_ 78 | 79 | 80 | .. 
_twitter/twitter-text: https://github.com/twitter/twitter-text 81 | .. _edmondburnett/twitter-text-python: https://github.com/edmondburnett/twitter-text-python 82 | .. _official conformance test suite: https://github.com/twitter/twitter-text/tree/master/conformance 83 | .. _search-api: https://developer.twitter.com/en/docs/tweets/search/api-reference/get-search-tweets.html 84 | .. _Counting characters -- Twitter Developers: https://developer.twitter.com/en/docs/basics/counting-characters.html 85 | .. _the documentation from Twitter Developers: https://developer.twitter.com/en/docs/developer-utilities/twitter-text 86 | .. _twitter-text Parser -- Twitter Developers: https://developer.twitter.com/en/docs/developer-utilities/twitter-text 87 | -------------------------------------------------------------------------------- /twitter_text/extract_urls.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | 3 | from .regexp.extract_url import extract_url 4 | from .regexp.invalid_url_without_protocol_preceding_chars import invalid_url_without_protocol_preceding_chars 5 | from .regexp.valid_ascii_domain import valid_ascii_domain 6 | from .regexp.valid_tco_url import valid_tco_url 7 | 8 | default_protocol = 'https://' 9 | max_url_length = 4096 10 | max_tco_slug_length = 40 11 | 12 | 13 | def extract_urls(text: str, extract_urls_without_protocol: bool = True) -> List[str]: 14 | """ 15 | Extract valid URLs present in ``text``. 16 | 17 | >>> extract_urls('http://twitter.com/これは日本語です。example.com中国語') 18 | ["url": "http://twitter.com/", "example.com"] 19 | """ 20 | return [dic['url'] for dic in extract_urls_with_indices(text, extract_urls_without_protocol)] 21 | 22 | 23 | def extract_urls_with_indices(text: str, extract_urls_without_protocol: bool = True) -> List[dict]: 24 | """ 25 | Extract valid URLs present in ``text`` along with their Unicode code point indices. 26 | 27 | >>> extract_urls_with_indices('http://twitter.com/これは日本語です。example.com中国語') 28 | [ 29 | { 30 | "url": "http://twitter.com/", 31 | "indices": [0, 19] 32 | }, 33 | { 34 | "url": "example.com", 35 | "indices": [28, 39] 36 | } 37 | ] 38 | """ 39 | if text == '' or ('.' not in text if extract_urls_without_protocol else ':' not in text): 40 | return [] 41 | 42 | urls = [] 43 | 44 | for url_match in extract_url.finditer(text): 45 | _, before, url, protocol, domain, _, path, _ = url_match.groups() 46 | end_position = url_match.end() 47 | start_position = end_position - len(url) 48 | 49 | if not is_valid_url(url, protocol or default_protocol, domain): 50 | continue 51 | 52 | # extract ASCII-only domains. 53 | if protocol is None: 54 | if not extract_urls_without_protocol or \ 55 | invalid_url_without_protocol_preceding_chars.match(before): 56 | continue 57 | 58 | last_url = None 59 | for ascii_domain_match in valid_ascii_domain.finditer(domain): 60 | ascii_domain = ascii_domain_match.group(0) 61 | ascii_start_position = ascii_domain_match.start() 62 | ascii_end_position = ascii_domain_match.end() 63 | last_url = { 64 | 'url': ascii_domain, 65 | 'indices': [start_position + ascii_start_position, start_position + ascii_end_position] 66 | } 67 | urls.append(last_url) 68 | 69 | # no ASCII-only domain found. Skip the entire URL. 70 | if last_url is None: 71 | continue 72 | 73 | # lastUrl only contains domain. Need to add path and query if they exist. 
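            # In other words: the replace() below swaps the ASCII-only domain kept above into the
            # full match, and the end index is pushed out to cover the trailing path/query.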
74 | if path: 75 | last_url['url'] = url.replace(domain, last_url['url']) 76 | last_url['indices'][1] = end_position 77 | else: 78 | # In the case of t.co URLs, don't allow additional path characters. 79 | tco_url_match = valid_tco_url.search(url) 80 | 81 | if tco_url_match: 82 | tco_url_slug = tco_url_match.group(1) 83 | if tco_url_slug and len(tco_url_slug) > max_tco_slug_length: 84 | continue 85 | else: 86 | url = tco_url_match.group(0) 87 | end_position = start_position + len(url) 88 | 89 | urls.append({ 90 | 'url': url, 91 | 'indices': [start_position, end_position] 92 | }) 93 | 94 | return urls 95 | 96 | 97 | def is_valid_url(url: str, protocol: str, domain: str) -> bool: 98 | puny_encoded_domain = idna_to_ascii(domain) 99 | 100 | if (not puny_encoded_domain) or len(puny_encoded_domain) == 0: 101 | return False 102 | else: 103 | url_length = len(url) + len(puny_encoded_domain) - len(domain) 104 | return len(protocol) + url_length <= max_url_length 105 | 106 | 107 | def idna_to_ascii(domain: str) -> Optional[str]: 108 | """ 109 | Convert an Internationalized Domain Name (IDN) into a Punycode string. 110 | Return `None` if the `domain` is invalid. 111 | 112 | >>> idna_to_ascii('日本語.jp') 113 | 'xn--wgv71a119e.jp' 114 | """ 115 | try: 116 | return domain.encode('idna').decode('ascii') 117 | except Exception: 118 | return None 119 | -------------------------------------------------------------------------------- /twitter_text/regexp/emoji.py: -------------------------------------------------------------------------------- 1 | import re 2 | from collections import namedtuple 3 | from typing import List, Tuple 4 | 5 | import pkg_resources 6 | 7 | Emoji = namedtuple('Emoji', ('group', 'sub_group', 'name', 'status', 'codepoint', 'emoji')) 8 | 9 | 10 | def parse_emoji_list(text: str) -> List[Emoji]: 11 | emoji_entries = [] 12 | 13 | for line in text.splitlines()[32:]: # skip the explanation lines 14 | if line == '# Status Counts': # the last line in the document 15 | break 16 | if 'subtotal:' in line: # these are lines showing statistics about each group, not needed 17 | continue 18 | if not line: # if it's a blank line 19 | continue 20 | if line.startswith('#'): # these lines contain group and/or sub-group names 21 | if '# group:' in line: 22 | group = line.split(':')[-1].strip() 23 | if '# subgroup:' in line: 24 | subgroup = line.split(':')[-1].strip() 25 | if group == 'Component': # skin tones, and hair types, skip, as mentioned above 26 | continue 27 | if re.search('^[0-9A-F]{3,}', line): # if the line starts with a hexadecimal number (an emoji code point) 28 | # here we define all the elements that will go into emoji entries 29 | codepoint = line.split(';')[0].strip() # in some cases it is one and in others multiple code points 30 | status = line.split(';')[-1].split()[0].strip() # status: fully-qualified, minimally-qualified, unqualified 31 | if line[-1] == '#': 32 | # The special case where the emoji is actually the hash sign "#". 
In this case manually assign the emoji 33 | if 'fully-qualified' in line: 34 | emoji = '#️⃣' 35 | else: 36 | emoji = '#⃣' # they look the same, but are actually different 37 | else: # the default case 38 | emoji = line.split('#')[-1].split()[0].strip() # the emoji character itself 39 | if line[-1] == '#': # (the special case) 40 | name = '#' 41 | else: # extract the emoji name 42 | name = '_'.join(line.split('#')[-1][1:].split()[1:]).replace('_', ' ') 43 | templine = Emoji( 44 | codepoint=codepoint, 45 | status=status, 46 | emoji=emoji, 47 | name=name, 48 | group=group, 49 | sub_group=subgroup) 50 | emoji_entries.append(templine) 51 | 52 | return emoji_entries 53 | 54 | 55 | def regex_for_multi_codepoint_emojis(emoji_list: List[Emoji]) -> str: 56 | multi_codepoint_emoji = [] 57 | 58 | for code in [c.codepoint.split() for c in emoji_list]: 59 | if len(code) > 1: 60 | # turn to a hexadecimal number zfilled to 8 zeros e.g: '\U0001F44D' 61 | hexified_codes = [r'\U' + x.zfill(8) for x in code] 62 | hexified_codes = ''.join(hexified_codes) # join all hexadecimal components 63 | multi_codepoint_emoji.append(hexified_codes) 64 | 65 | # sorting by length in decreasing order is extremely important 66 | multi_codepoint_emoji_sorted = sorted(multi_codepoint_emoji, key=len, reverse=True) 67 | 68 | # join with a "|" to function as an "or" in the regex 69 | multi_codepoint_emoji_joined = '|'.join(multi_codepoint_emoji_sorted) 70 | 71 | return multi_codepoint_emoji_joined 72 | 73 | 74 | def regex_for_single_codepoint_emojis(emoji_list: List[Emoji]) -> str: 75 | single_codepoint_emoji_raw = r'' # start with an empty raw string 76 | for code in single_codepoint_emoji_ranges: 77 | if code[0] == code[1]: # in this case make it a single hexadecimal character 78 | temp_regex = r'\U' + hex(code[0])[2:].zfill(8) 79 | single_codepoint_emoji_raw += temp_regex 80 | else: 81 | # otherwise create a character range, joined by '-' 82 | temp_regex = '-'.join([r'\U' + hex(code[0])[2:].zfill(8), r'\U' + hex(code[1])[2:].zfill(8)]) 83 | single_codepoint_emoji_raw += temp_regex 84 | 85 | 86 | def get_ranges(nums: List[int]) -> List[Tuple[int, int]]: 87 | """Reduce a list of integers to tuples of local maximums and minimums. 88 | 89 | :param nums: List of integers. 
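    For example, runs of consecutive integers collapse into a single (min, max) pair and
    isolated values map onto themselves:

    >>> get_ranges([1, 2, 3, 7, 8, 20])
    [(1, 3), (7, 8), (20, 20)]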
90 | :return ranges: List of tuples showing local minimums and maximums 91 | """ 92 | nums = sorted(nums) 93 | lows = [nums[0]] 94 | highs = [] 95 | if nums[1] - nums[0] > 1: 96 | highs.append(nums[0]) 97 | for i in range(1, len(nums) - 1): 98 | if (nums[i] - nums[i - 1]) > 1: 99 | lows.append(nums[i]) 100 | if (nums[i + 1] - nums[i]) > 1: 101 | highs.append(nums[i]) 102 | highs.append(nums[-1]) 103 | if len(highs) > len(lows): 104 | lows.append(highs[-1]) 105 | return [(l, h) for l, h in zip(lows, highs)] 106 | 107 | 108 | emoji_raw = pkg_resources.resource_string(__name__, 'emoji-test.txt').decode('utf-8') 109 | emoji_list = parse_emoji_list(emoji_raw) 110 | emoji_dict = {x.emoji: x for x in emoji_list} 111 | 112 | multi_codepoint_emoji_joined = regex_for_multi_codepoint_emojis(emoji_list) 113 | 114 | single_codepoint_emoji = [] 115 | 116 | for code in [c.codepoint.split() for c in emoji_list]: 117 | if len(code) == 1: 118 | single_codepoint_emoji.append(code[0]) 119 | 120 | single_codepoint_emoji_int = [int(x, base=16) for x in single_codepoint_emoji] 121 | single_codepoint_emoji_ranges = get_ranges(single_codepoint_emoji_int) 122 | 123 | single_codepoint_emoji_raw = r'' # start with an empty raw string 124 | for code in single_codepoint_emoji_ranges: 125 | if code[0] == code[1]: # in this case make it a single hexadecimal character 126 | temp_regex = r'\U' + hex(code[0])[2:].zfill(8) 127 | single_codepoint_emoji_raw += temp_regex 128 | else: 129 | # otherwise create a character range, joined by '-' 130 | temp_regex = '-'.join([r'\U' + hex(code[0])[2:].zfill(8), r'\U' + hex(code[1])[2:].zfill(8)]) 131 | single_codepoint_emoji_raw += temp_regex 132 | 133 | emoji = re.compile(multi_codepoint_emoji_joined + '|' + r'[' + single_codepoint_emoji_raw + r']') 134 | -------------------------------------------------------------------------------- /twitter_text/parse_tweet.py: -------------------------------------------------------------------------------- 1 | import re 2 | import unicodedata 3 | from dataclasses import dataclass, asdict 4 | from math import floor 5 | from typing import List, Dict 6 | 7 | from .config import config 8 | from .extract_emojis import extract_emojis_with_indices 9 | from .extract_urls import extract_urls_with_indices 10 | from .get_character_weight import get_character_weight 11 | from .has_invalid_characters import has_invalid_characters 12 | 13 | 14 | @dataclass(frozen=True) 15 | class ParsedResult: 16 | valid: bool 17 | weightedLength: int 18 | permillage: int 19 | validRangeStart: int 20 | validRangeEnd: int 21 | displayRangeStart: int 22 | displayRangeEnd: int 23 | 24 | def asdict(self) -> dict: 25 | return asdict(self) 26 | 27 | 28 | def convert_line_ending(string, to="\n"): 29 | return re.sub(r'\r\n|\r|\n', to, string) 30 | 31 | 32 | def parse_tweet(text: str, options: dict = config['defaults']) -> ParsedResult: 33 | """ 34 | Parse a Twitter text according to https://developer.twitter.com/en/docs/developer-utilities/twitter-text 35 | 36 | :param str text: A text to parse. 37 | :param dict options: Parameters for counting the weighted tweet length. This must have the following properties: 38 | 39 | max_weighted_tweet_length (int) 40 | Valid tweet messages must not exceed this weighted length. 41 | 42 | default_weight (int) 43 | Default weight to cover code points not defined in the ``ranges``. 44 | 45 | ranges (list of dict) 46 | A list of Unicode code point ranges, with a weight associated with each of these ranges. 
47 | Each element of ``ranges`` must have the following attributes: 48 | 49 | - start (int) 50 | - end (int) 51 | - weight (int) 52 | 53 | scale (int) 54 | The weights are divided by ``scale``. 55 | 56 | emoji_parsing_enabled (bool) 57 | When set to ``True``, an emoji consisting of multiple Unicode code points is counted as a single character, 58 | resulting in a visually intuitive weighted length. 59 | 60 | transformed_url_length (int) 61 | The default length assigned to all URLs. 62 | 63 | :return ParsedResult: An object having the following properties: 64 | 65 | weightedLength (int) 66 | The weighted length of the tweet text. 67 | 68 | Each Unicode character (or URL, or emoji) in ``text`` is assigned an integer weight, 69 | and these weights are summed to calculate ``weightedLength``. 70 | 71 | valid (bool) 72 | True if the ``text`` is valid, i.e., 73 | 74 | - ``weightedLength <= max_weighted_tweet_length`` 75 | - ``text`` does not contain invalid characters. 76 | 77 | permillage (int) 78 | Equal to ``floor(weightedLength / max_weighted_tweet_length * 1000)``. 79 | 80 | displayRangeStart (int) 81 | Always 0. 82 | 83 | displayRangeEnd (int) 84 | Number of UTF-16 code units in ``text``, minus one. 85 | 86 | validRangeStart (int) 87 | Always 0. 88 | 89 | validRangeEnd (int) 90 | Number of UTF-16 code units in the valid part of ``text``, minus one. 91 | 92 | The "valid part" here means the longest valid Unicode substring starting from the beginning of ``text``. 93 | 94 | 95 | Example: 96 | 97 | >>> parse_tweet('english text 日本語 😷 https://example.com') 98 | ParsedResult( 99 | valid=True, 100 | weightedLength=46, 101 | permillage=164, 102 | validRangeStart=0, 103 | validRangeEnd=38, 104 | displayRangeStart=0, 105 | displayRangeEnd=38 106 | ) 107 | """ 108 | scale = options['scale'] 109 | transformed_url_length = options['transformed_url_length'] 110 | default_weight = options['default_weight'] 111 | emoji_parsing_enabled = options['emoji_parsing_enabled'] 112 | max_weighted_tweet_length = options['max_weighted_tweet_length'] 113 | 114 | normalized_text = convert_line_ending(unicodedata.normalize('NFC', text)) 115 | 116 | url_entities_map = transform_entities_to_hash(extract_urls_with_indices(normalized_text)) 117 | emoji_entities_map = transform_entities_to_hash(extract_emojis_with_indices(normalized_text)) 118 | 119 | weighted_length = 0 120 | valid_display_index = 0 121 | valid = True 122 | char_index = 0 123 | 124 | while char_index < len(normalized_text): 125 | if char_index in url_entities_map: 126 | url = url_entities_map[char_index]['url'] 127 | weighted_length += transformed_url_length * scale 128 | char_index += len(url) - 1 129 | elif emoji_parsing_enabled and char_index in emoji_entities_map: 130 | emoji = emoji_entities_map[char_index]['emoji'] 131 | weighted_length += default_weight 132 | char_index += len(emoji) - 1 133 | else: 134 | weighted_length += get_character_weight(normalized_text[char_index], options) 135 | 136 | if valid: 137 | valid = not has_invalid_characters(normalized_text[char_index:char_index + 1]) 138 | 139 | if valid and weighted_length <= max_weighted_tweet_length * scale: 140 | valid_display_index = char_index 141 | 142 | char_index += 1 143 | 144 | weighted_length = int(weighted_length / scale) 145 | valid_display_offset = count_utf16_bytes(normalized_text[:valid_display_index + 1]) - 1 146 | normalization_offset = count_utf16_bytes(text) - count_utf16_bytes(normalized_text) 147 | 148 | return ParsedResult( 149 | weightedLength=weighted_length, 150 | 
valid=valid and 0 < weighted_length <= max_weighted_tweet_length, 151 | permillage=floor((weighted_length / max_weighted_tweet_length) * 1000), 152 | validRangeStart=0, 153 | validRangeEnd=valid_display_offset + normalization_offset, 154 | displayRangeStart=0, 155 | displayRangeEnd=count_utf16_bytes(text) - 1 if count_utf16_bytes(text) > 0 else 0 156 | ) 157 | 158 | 159 | def transform_entities_to_hash(entities: List[dict]) -> Dict[int, dict]: 160 | return {entity['indices'][0]: entity for entity in entities} 161 | 162 | 163 | def count_utf16_bytes(text: str) -> int: 164 | return len(text.encode('utf-16')) // 2 - 1 165 | -------------------------------------------------------------------------------- /twitter_text/regexp/valid_gtld.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | valid_gtld = re.compile( 4 | '(?:(?:' + 5 | '삼성|닷컴|닷넷|香格里拉|餐厅|食品|飞利浦|電訊盈科|集团|通販|购物|谷歌|诺基亚|联通|网络|网站|网店|网址|组织机构|移动|珠宝|点看|游戏|淡马锡|机构|書籍|时尚|新闻|' + 6 | '政府|政务|招聘|手表|手机|我爱你|慈善|微博|广东|工行|家電|娱乐|天主教|大拿|大众汽车|在线|嘉里大酒店|嘉里|商标|商店|商城|公益|公司|八卦|健康|信息|佛山|企业|' + 7 | '中文网|中信|世界|ポイント|ファッション|セール|ストア|コム|グーグル|クラウド|みんな|คอม|संगठन|नेट|कॉम|همراه|موقع|موبايلي|كوم|' + 8 | 'كاثوليك|عرب|شبكة|بيتك|بازار|العليان|ارامكو|اتصالات|ابوظبي|קום|сайт|рус|орг|онлайн|москва|ком|' + 9 | 'католик|дети|zuerich|zone|zippo|zip|zero|zara|zappos|yun|youtube|you|yokohama|yoga|yodobashi|' + 10 | 'yandex|yamaxun|yahoo|yachts|xyz|xxx|xperia|xin|xihuan|xfinity|xerox|xbox|wtf|wtc|wow|world|' + 11 | 'works|work|woodside|wolterskluwer|wme|winners|wine|windows|win|williamhill|wiki|wien|whoswho|' + 12 | 'weir|weibo|wedding|wed|website|weber|webcam|weatherchannel|weather|watches|watch|warman|' + 13 | 'wanggou|wang|walter|walmart|wales|vuelos|voyage|voto|voting|vote|volvo|volkswagen|vodka|' + 14 | 'vlaanderen|vivo|viva|vistaprint|vista|vision|visa|virgin|vip|vin|villas|viking|vig|video|' + 15 | 'viajes|vet|versicherung|vermögensberatung|vermögensberater|verisign|ventures|vegas|vanguard|' + 16 | 'vana|vacations|ups|uol|uno|university|unicom|uconnect|ubs|ubank|tvs|tushu|tunes|tui|tube|trv|' + 17 | 'trust|travelersinsurance|travelers|travelchannel|travel|training|trading|trade|toys|toyota|' + 18 | 'town|tours|total|toshiba|toray|top|tools|tokyo|today|tmall|tkmaxx|tjx|tjmaxx|tirol|tires|tips|' + 19 | 'tiffany|tienda|tickets|tiaa|theatre|theater|thd|teva|tennis|temasek|telefonica|telecity|tel|' + 20 | 'technology|tech|team|tdk|tci|taxi|tax|tattoo|tatar|tatamotors|target|taobao|talk|taipei|tab|' + 21 | 'systems|symantec|sydney|swiss|swiftcover|swatch|suzuki|surgery|surf|support|supply|supplies|' + 22 | 'sucks|style|study|studio|stream|store|storage|stockholm|stcgroup|stc|statoil|statefarm|' + 23 | 'statebank|starhub|star|staples|stada|srt|srl|spreadbetting|spot|sport|spiegel|space|soy|sony|' + 24 | 'song|solutions|solar|sohu|software|softbank|social|soccer|sncf|smile|smart|sling|skype|sky|' + 25 | 'skin|ski|site|singles|sina|silk|shriram|showtime|show|shouji|shopping|shop|shoes|shiksha|shia|' + 26 | 'shell|shaw|sharp|shangrila|sfr|sexy|sex|sew|seven|ses|services|sener|select|seek|security|' + 27 | 'secure|seat|search|scot|scor|scjohnson|science|schwarz|schule|school|scholarships|schmidt|' + 28 | 'schaeffler|scb|sca|sbs|sbi|saxo|save|sas|sarl|sapo|sap|sanofi|sandvikcoromant|sandvik|samsung|' + 29 | 'samsclub|salon|sale|sakura|safety|safe|saarland|ryukyu|rwe|run|ruhr|rugby|rsvp|room|rogers|' + 30 | 'rodeo|rocks|rocher|rmit|rip|rio|ril|rightathome|ricoh|richardli|rich|rexroth|reviews|review|' + 31 | 
'restaurant|rest|republican|report|repair|rentals|rent|ren|reliance|reit|reisen|reise|rehab|' + 32 | 'redumbrella|redstone|red|recipes|realty|realtor|realestate|read|raid|radio|racing|qvc|quest|' + 33 | 'quebec|qpon|pwc|pub|prudential|pru|protection|property|properties|promo|progressive|prof|' + 34 | 'productions|prod|pro|prime|press|praxi|pramerica|post|porn|politie|poker|pohl|pnc|plus|' + 35 | 'plumbing|playstation|play|place|pizza|pioneer|pink|ping|pin|pid|pictures|pictet|pics|piaget|' + 36 | 'physio|photos|photography|photo|phone|philips|phd|pharmacy|pfizer|pet|pccw|pay|passagens|' + 37 | 'party|parts|partners|pars|paris|panerai|panasonic|pamperedchef|page|ovh|ott|otsuka|osaka|' + 38 | 'origins|orientexpress|organic|org|orange|oracle|open|ooo|onyourside|online|onl|ong|one|omega|' + 39 | 'ollo|oldnavy|olayangroup|olayan|okinawa|office|off|observer|obi|nyc|ntt|nrw|nra|nowtv|nowruz|' + 40 | 'now|norton|northwesternmutual|nokia|nissay|nissan|ninja|nikon|nike|nico|nhk|ngo|nfl|nexus|' + 41 | 'nextdirect|next|news|newholland|new|neustar|network|netflix|netbank|net|nec|nba|navy|natura|' + 42 | 'nationwide|name|nagoya|nadex|nab|mutuelle|mutual|museum|mtr|mtpc|mtn|msd|movistar|movie|mov|' + 43 | 'motorcycles|moto|moscow|mortgage|mormon|mopar|montblanc|monster|money|monash|mom|moi|moe|moda|' + 44 | 'mobily|mobile|mobi|mma|mls|mlb|mitsubishi|mit|mint|mini|mil|microsoft|miami|metlife|merckmsd|' + 45 | 'meo|menu|men|memorial|meme|melbourne|meet|media|med|mckinsey|mcdonalds|mcd|mba|mattel|' + 46 | 'maserati|marshalls|marriott|markets|marketing|market|map|mango|management|man|makeup|maison|' + 47 | 'maif|madrid|macys|luxury|luxe|lupin|lundbeck|ltda|ltd|lplfinancial|lpl|love|lotto|lotte|' + 48 | 'london|lol|loft|locus|locker|loans|loan|llp|llc|lixil|living|live|lipsy|link|linde|lincoln|' + 49 | 'limo|limited|lilly|like|lighting|lifestyle|lifeinsurance|life|lidl|liaison|lgbt|lexus|lego|' + 50 | 'legal|lefrak|leclerc|lease|lds|lawyer|law|latrobe|latino|lat|lasalle|lanxess|landrover|land|' + 51 | 'lancome|lancia|lancaster|lamer|lamborghini|ladbrokes|lacaixa|kyoto|kuokgroup|kred|krd|kpn|' + 52 | 'kpmg|kosher|komatsu|koeln|kiwi|kitchen|kindle|kinder|kim|kia|kfh|kerryproperties|' + 53 | 'kerrylogistics|kerryhotels|kddi|kaufen|juniper|juegos|jprs|jpmorgan|joy|jot|joburg|jobs|jnj|' + 54 | 'jmp|jll|jlc|jio|jewelry|jetzt|jeep|jcp|jcb|java|jaguar|iwc|iveco|itv|itau|istanbul|ist|' + 55 | 'ismaili|iselect|irish|ipiranga|investments|intuit|international|intel|int|insure|insurance|' + 56 | 'institute|ink|ing|info|infiniti|industries|inc|immobilien|immo|imdb|imamat|ikano|iinet|ifm|' + 57 | 'ieee|icu|ice|icbc|ibm|hyundai|hyatt|hughes|htc|hsbc|how|house|hotmail|hotels|hoteles|hot|' + 58 | 'hosting|host|hospital|horse|honeywell|honda|homesense|homes|homegoods|homedepot|holiday|' + 59 | 'holdings|hockey|hkt|hiv|hitachi|hisamitsu|hiphop|hgtv|hermes|here|helsinki|help|healthcare|' + 60 | 'health|hdfcbank|hdfc|hbo|haus|hangout|hamburg|hair|guru|guitars|guide|guge|gucci|guardian|' + 61 | 'group|grocery|gripe|green|gratis|graphics|grainger|gov|got|gop|google|goog|goodyear|goodhands|' + 62 | 'goo|golf|goldpoint|gold|godaddy|gmx|gmo|gmbh|gmail|globo|global|gle|glass|glade|giving|gives|' + 63 | 'gifts|gift|ggee|george|genting|gent|gea|gdn|gbiz|gay|garden|gap|games|game|gallup|gallo|' + 64 | 'gallery|gal|fyi|futbol|furniture|fund|fun|fujixerox|fujitsu|ftr|frontier|frontdoor|frogans|' + 65 | 'frl|fresenius|free|fox|foundation|forum|forsale|forex|ford|football|foodnetwork|food|foo|fly|' + 66 | 
'flsmidth|flowers|florist|flir|flights|flickr|fitness|fit|fishing|fish|firmdale|firestone|fire|' + 67 | 'financial|finance|final|film|fido|fidelity|fiat|ferrero|ferrari|feedback|fedex|fast|fashion|' + 68 | 'farmers|farm|fans|fan|family|faith|fairwinds|fail|fage|extraspace|express|exposed|expert|' + 69 | 'exchange|everbank|events|eus|eurovision|etisalat|esurance|estate|esq|erni|ericsson|equipment|' + 70 | 'epson|epost|enterprises|engineering|engineer|energy|emerck|email|education|edu|edeka|eco|eat|' + 71 | 'earth|dvr|dvag|durban|dupont|duns|dunlop|duck|dubai|dtv|drive|download|dot|doosan|domains|' + 72 | 'doha|dog|dodge|doctor|docs|dnp|diy|dish|discover|discount|directory|direct|digital|diet|' + 73 | 'diamonds|dhl|dev|design|desi|dentist|dental|democrat|delta|deloitte|dell|delivery|degree|' + 74 | 'deals|dealer|deal|dds|dclk|day|datsun|dating|date|data|dance|dad|dabur|cyou|cymru|cuisinella|' + 75 | 'csc|cruises|cruise|crs|crown|cricket|creditunion|creditcard|credit|cpa|courses|coupons|coupon|' + 76 | 'country|corsica|coop|cool|cookingchannel|cooking|contractors|contact|consulting|construction|' + 77 | 'condos|comsec|computer|compare|company|community|commbank|comcast|com|cologne|college|coffee|' + 78 | 'codes|coach|clubmed|club|cloud|clothing|clinique|clinic|click|cleaning|claims|cityeats|city|' + 79 | 'citic|citi|citadel|cisco|circle|cipriani|church|chrysler|chrome|christmas|chloe|chintai|cheap|' + 80 | 'chat|chase|charity|channel|chanel|cfd|cfa|cern|ceo|center|ceb|cbs|cbre|cbn|cba|catholic|' + 81 | 'catering|cat|casino|cash|caseih|case|casa|cartier|cars|careers|career|care|cards|caravan|car|' + 82 | 'capitalone|capital|capetown|canon|cancerresearch|camp|camera|cam|calvinklein|call|cal|cafe|' + 83 | 'cab|bzh|buzz|buy|business|builders|build|bugatti|budapest|brussels|brother|broker|broadway|' + 84 | 'bridgestone|bradesco|box|boutique|bot|boston|bostik|bosch|boots|booking|book|boo|bond|bom|' + 85 | 'bofa|boehringer|boats|bnpparibas|bnl|bmw|bms|blue|bloomberg|blog|blockbuster|blanco|' + 86 | 'blackfriday|black|biz|bio|bingo|bing|bike|bid|bible|bharti|bet|bestbuy|best|berlin|bentley|' + 87 | 'beer|beauty|beats|bcn|bcg|bbva|bbt|bbc|bayern|bauhaus|basketball|baseball|bargains|barefoot|' + 88 | 'barclays|barclaycard|barcelona|bar|bank|band|bananarepublic|banamex|baidu|baby|azure|axa|aws|' + 89 | 'avianca|autos|auto|author|auspost|audio|audible|audi|auction|attorney|athleta|associates|asia|' + 90 | 'asda|arte|art|arpa|army|archi|aramco|arab|aquarelle|apple|app|apartments|aol|anz|anquan|' + 91 | 'android|analytics|amsterdam|amica|amfam|amex|americanfamily|americanexpress|alstom|alsace|' + 92 | 'ally|allstate|allfinanz|alipay|alibaba|alfaromeo|akdn|airtel|airforce|airbus|aigo|aig|agency|' + 93 | 'agakhan|africa|afl|afamilycompany|aetna|aero|aeg|adult|ads|adac|actor|active|aco|accountants|' + 94 | 'accountant|accenture|academy|abudhabi|abogado|able|abc|abbvie|abbott|abb|abarth|aarp|aaa|' + 95 | 'onion' + 96 | ')(?=[^0-9a-zA-Z@+-]|$))' 97 | ) 98 | -------------------------------------------------------------------------------- /poetry.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Poetry 1.5.0 and should not be changed by hand. 
2 | 3 | [[package]] 4 | name = "alabaster" 5 | version = "0.7.13" 6 | description = "A configurable sidebar-enabled Sphinx theme" 7 | optional = false 8 | python-versions = ">=3.6" 9 | files = [ 10 | {file = "alabaster-0.7.13-py3-none-any.whl", hash = "sha256:1ee19aca801bbabb5ba3f5f258e4422dfa86f82f3e9cefb0859b283cdd7f62a3"}, 11 | {file = "alabaster-0.7.13.tar.gz", hash = "sha256:a27a4a084d5e690e16e01e03ad2b2e552c61a65469419b907243193de1a84ae2"}, 12 | ] 13 | 14 | [[package]] 15 | name = "babel" 16 | version = "2.12.1" 17 | description = "Internationalization utilities" 18 | optional = false 19 | python-versions = ">=3.7" 20 | files = [ 21 | {file = "Babel-2.12.1-py3-none-any.whl", hash = "sha256:b4246fb7677d3b98f501a39d43396d3cafdc8eadb045f4a31be01863f655c610"}, 22 | {file = "Babel-2.12.1.tar.gz", hash = "sha256:cc2d99999cd01d44420ae725a21c9e3711b3aadc7976d6147f622d8581963455"}, 23 | ] 24 | 25 | [package.dependencies] 26 | pytz = {version = ">=2015.7", markers = "python_version < \"3.9\""} 27 | 28 | [[package]] 29 | name = "certifi" 30 | version = "2023.5.7" 31 | description = "Python package for providing Mozilla's CA Bundle." 32 | optional = false 33 | python-versions = ">=3.6" 34 | files = [ 35 | {file = "certifi-2023.5.7-py3-none-any.whl", hash = "sha256:c6c2e98f5c7869efca1f8916fed228dd91539f9f1b444c314c06eef02980c716"}, 36 | {file = "certifi-2023.5.7.tar.gz", hash = "sha256:0f0d56dc5a6ad56fd4ba36484d6cc34451e1c6548c61daad8c320169f91eddc7"}, 37 | ] 38 | 39 | [[package]] 40 | name = "charset-normalizer" 41 | version = "3.1.0" 42 | description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." 43 | optional = false 44 | python-versions = ">=3.7.0" 45 | files = [ 46 | {file = "charset-normalizer-3.1.0.tar.gz", hash = "sha256:34e0a2f9c370eb95597aae63bf85eb5e96826d81e3dcf88b8886012906f509b5"}, 47 | {file = "charset_normalizer-3.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e0ac8959c929593fee38da1c2b64ee9778733cdf03c482c9ff1d508b6b593b2b"}, 48 | {file = "charset_normalizer-3.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d7fc3fca01da18fbabe4625d64bb612b533533ed10045a2ac3dd194bfa656b60"}, 49 | {file = "charset_normalizer-3.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:04eefcee095f58eaabe6dc3cc2262f3bcd776d2c67005880894f447b3f2cb9c1"}, 50 | {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20064ead0717cf9a73a6d1e779b23d149b53daf971169289ed2ed43a71e8d3b0"}, 51 | {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1435ae15108b1cb6fffbcea2af3d468683b7afed0169ad718451f8db5d1aff6f"}, 52 | {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c84132a54c750fda57729d1e2599bb598f5fa0344085dbde5003ba429a4798c0"}, 53 | {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75f2568b4189dda1c567339b48cba4ac7384accb9c2a7ed655cd86b04055c795"}, 54 | {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:11d3bcb7be35e7b1bba2c23beedac81ee893ac9871d0ba79effc7fc01167db6c"}, 55 | {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:891cf9b48776b5c61c700b55a598621fdb7b1e301a550365571e9624f270c203"}, 56 | {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_i686.whl", hash = 
"sha256:5f008525e02908b20e04707a4f704cd286d94718f48bb33edddc7d7b584dddc1"}, 57 | {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:b06f0d3bf045158d2fb8837c5785fe9ff9b8c93358be64461a1089f5da983137"}, 58 | {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:49919f8400b5e49e961f320c735388ee686a62327e773fa5b3ce6721f7e785ce"}, 59 | {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:22908891a380d50738e1f978667536f6c6b526a2064156203d418f4856d6e86a"}, 60 | {file = "charset_normalizer-3.1.0-cp310-cp310-win32.whl", hash = "sha256:12d1a39aa6b8c6f6248bb54550efcc1c38ce0d8096a146638fd4738e42284448"}, 61 | {file = "charset_normalizer-3.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:65ed923f84a6844de5fd29726b888e58c62820e0769b76565480e1fdc3d062f8"}, 62 | {file = "charset_normalizer-3.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9a3267620866c9d17b959a84dd0bd2d45719b817245e49371ead79ed4f710d19"}, 63 | {file = "charset_normalizer-3.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6734e606355834f13445b6adc38b53c0fd45f1a56a9ba06c2058f86893ae8017"}, 64 | {file = "charset_normalizer-3.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f8303414c7b03f794347ad062c0516cee0e15f7a612abd0ce1e25caf6ceb47df"}, 65 | {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aaf53a6cebad0eae578f062c7d462155eada9c172bd8c4d250b8c1d8eb7f916a"}, 66 | {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3dc5b6a8ecfdc5748a7e429782598e4f17ef378e3e272eeb1340ea57c9109f41"}, 67 | {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e1b25e3ad6c909f398df8921780d6a3d120d8c09466720226fc621605b6f92b1"}, 68 | {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ca564606d2caafb0abe6d1b5311c2649e8071eb241b2d64e75a0d0065107e62"}, 69 | {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b82fab78e0b1329e183a65260581de4375f619167478dddab510c6c6fb04d9b6"}, 70 | {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:bd7163182133c0c7701b25e604cf1611c0d87712e56e88e7ee5d72deab3e76b5"}, 71 | {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:11d117e6c63e8f495412d37e7dc2e2fff09c34b2d09dbe2bee3c6229577818be"}, 72 | {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:cf6511efa4801b9b38dc5546d7547d5b5c6ef4b081c60b23e4d941d0eba9cbeb"}, 73 | {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:abc1185d79f47c0a7aaf7e2412a0eb2c03b724581139193d2d82b3ad8cbb00ac"}, 74 | {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:cb7b2ab0188829593b9de646545175547a70d9a6e2b63bf2cd87a0a391599324"}, 75 | {file = "charset_normalizer-3.1.0-cp311-cp311-win32.whl", hash = "sha256:c36bcbc0d5174a80d6cccf43a0ecaca44e81d25be4b7f90f0ed7bcfbb5a00909"}, 76 | {file = "charset_normalizer-3.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:cca4def576f47a09a943666b8f829606bcb17e2bc2d5911a46c8f8da45f56755"}, 77 | {file = "charset_normalizer-3.1.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:0c95f12b74681e9ae127728f7e5409cbbef9cd914d5896ef238cc779b8152373"}, 78 | {file = 
"charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fca62a8301b605b954ad2e9c3666f9d97f63872aa4efcae5492baca2056b74ab"}, 79 | {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ac0aa6cd53ab9a31d397f8303f92c42f534693528fafbdb997c82bae6e477ad9"}, 80 | {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c3af8e0f07399d3176b179f2e2634c3ce9c1301379a6b8c9c9aeecd481da494f"}, 81 | {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a5fc78f9e3f501a1614a98f7c54d3969f3ad9bba8ba3d9b438c3bc5d047dd28"}, 82 | {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:628c985afb2c7d27a4800bfb609e03985aaecb42f955049957814e0491d4006d"}, 83 | {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:74db0052d985cf37fa111828d0dd230776ac99c740e1a758ad99094be4f1803d"}, 84 | {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:1e8fcdd8f672a1c4fc8d0bd3a2b576b152d2a349782d1eb0f6b8e52e9954731d"}, 85 | {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:04afa6387e2b282cf78ff3dbce20f0cc071c12dc8f685bd40960cc68644cfea6"}, 86 | {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:dd5653e67b149503c68c4018bf07e42eeed6b4e956b24c00ccdf93ac79cdff84"}, 87 | {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d2686f91611f9e17f4548dbf050e75b079bbc2a82be565832bc8ea9047b61c8c"}, 88 | {file = "charset_normalizer-3.1.0-cp37-cp37m-win32.whl", hash = "sha256:4155b51ae05ed47199dc5b2a4e62abccb274cee6b01da5b895099b61b1982974"}, 89 | {file = "charset_normalizer-3.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:322102cdf1ab682ecc7d9b1c5eed4ec59657a65e1c146a0da342b78f4112db23"}, 90 | {file = "charset_normalizer-3.1.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:e633940f28c1e913615fd624fcdd72fdba807bf53ea6925d6a588e84e1151531"}, 91 | {file = "charset_normalizer-3.1.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:3a06f32c9634a8705f4ca9946d667609f52cf130d5548881401f1eb2c39b1e2c"}, 92 | {file = "charset_normalizer-3.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7381c66e0561c5757ffe616af869b916c8b4e42b367ab29fedc98481d1e74e14"}, 93 | {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3573d376454d956553c356df45bb824262c397c6e26ce43e8203c4c540ee0acb"}, 94 | {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e89df2958e5159b811af9ff0f92614dabf4ff617c03a4c1c6ff53bf1c399e0e1"}, 95 | {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:78cacd03e79d009d95635e7d6ff12c21eb89b894c354bd2b2ed0b4763373693b"}, 96 | {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de5695a6f1d8340b12a5d6d4484290ee74d61e467c39ff03b39e30df62cf83a0"}, 97 | {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1c60b9c202d00052183c9be85e5eaf18a4ada0a47d188a83c8f5c5b23252f649"}, 98 | {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = 
"sha256:f645caaf0008bacf349875a974220f1f1da349c5dbe7c4ec93048cdc785a3326"}, 99 | {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:ea9f9c6034ea2d93d9147818f17c2a0860d41b71c38b9ce4d55f21b6f9165a11"}, 100 | {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:80d1543d58bd3d6c271b66abf454d437a438dff01c3e62fdbcd68f2a11310d4b"}, 101 | {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:73dc03a6a7e30b7edc5b01b601e53e7fc924b04e1835e8e407c12c037e81adbd"}, 102 | {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:6f5c2e7bc8a4bf7c426599765b1bd33217ec84023033672c1e9a8b35eaeaaaf8"}, 103 | {file = "charset_normalizer-3.1.0-cp38-cp38-win32.whl", hash = "sha256:12a2b561af122e3d94cdb97fe6fb2bb2b82cef0cdca131646fdb940a1eda04f0"}, 104 | {file = "charset_normalizer-3.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:3160a0fd9754aab7d47f95a6b63ab355388d890163eb03b2d2b87ab0a30cfa59"}, 105 | {file = "charset_normalizer-3.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:38e812a197bf8e71a59fe55b757a84c1f946d0ac114acafaafaf21667a7e169e"}, 106 | {file = "charset_normalizer-3.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6baf0baf0d5d265fa7944feb9f7451cc316bfe30e8df1a61b1bb08577c554f31"}, 107 | {file = "charset_normalizer-3.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8f25e17ab3039b05f762b0a55ae0b3632b2e073d9c8fc88e89aca31a6198e88f"}, 108 | {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3747443b6a904001473370d7810aa19c3a180ccd52a7157aacc264a5ac79265e"}, 109 | {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b116502087ce8a6b7a5f1814568ccbd0e9f6cfd99948aa59b0e241dc57cf739f"}, 110 | {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d16fd5252f883eb074ca55cb622bc0bee49b979ae4e8639fff6ca3ff44f9f854"}, 111 | {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21fa558996782fc226b529fdd2ed7866c2c6ec91cee82735c98a197fae39f706"}, 112 | {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6f6c7a8a57e9405cad7485f4c9d3172ae486cfef1344b5ddd8e5239582d7355e"}, 113 | {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:ac3775e3311661d4adace3697a52ac0bab17edd166087d493b52d4f4f553f9f0"}, 114 | {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:10c93628d7497c81686e8e5e557aafa78f230cd9e77dd0c40032ef90c18f2230"}, 115 | {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:6f4f4668e1831850ebcc2fd0b1cd11721947b6dc7c00bf1c6bd3c929ae14f2c7"}, 116 | {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:0be65ccf618c1e7ac9b849c315cc2e8a8751d9cfdaa43027d4f6624bd587ab7e"}, 117 | {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:53d0a3fa5f8af98a1e261de6a3943ca631c526635eb5817a87a59d9a57ebf48f"}, 118 | {file = "charset_normalizer-3.1.0-cp39-cp39-win32.whl", hash = "sha256:a04f86f41a8916fe45ac5024ec477f41f886b3c435da2d4e3d2709b22ab02af1"}, 119 | {file = "charset_normalizer-3.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:830d2948a5ec37c386d3170c483063798d7879037492540f10a475e3fd6f244b"}, 120 | {file = 
"charset_normalizer-3.1.0-py3-none-any.whl", hash = "sha256:3d9098b479e78c85080c98e1e35ff40b4a31d8953102bb0fd7d1b6f8a2111a3d"}, 121 | ] 122 | 123 | [[package]] 124 | name = "colorama" 125 | version = "0.4.6" 126 | description = "Cross-platform colored terminal text." 127 | optional = false 128 | python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" 129 | files = [ 130 | {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, 131 | {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, 132 | ] 133 | 134 | [[package]] 135 | name = "docutils" 136 | version = "0.18.1" 137 | description = "Docutils -- Python Documentation Utilities" 138 | optional = false 139 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" 140 | files = [ 141 | {file = "docutils-0.18.1-py2.py3-none-any.whl", hash = "sha256:23010f129180089fbcd3bc08cfefccb3b890b0050e1ca00c867036e9d161b98c"}, 142 | {file = "docutils-0.18.1.tar.gz", hash = "sha256:679987caf361a7539d76e584cbeddc311e3aee937877c87346f31debc63e9d06"}, 143 | ] 144 | 145 | [[package]] 146 | name = "exceptiongroup" 147 | version = "1.1.1" 148 | description = "Backport of PEP 654 (exception groups)" 149 | optional = false 150 | python-versions = ">=3.7" 151 | files = [ 152 | {file = "exceptiongroup-1.1.1-py3-none-any.whl", hash = "sha256:232c37c63e4f682982c8b6459f33a8981039e5fb8756b2074364e5055c498c9e"}, 153 | {file = "exceptiongroup-1.1.1.tar.gz", hash = "sha256:d484c3090ba2889ae2928419117447a14daf3c1231d5e30d0aae34f354f01785"}, 154 | ] 155 | 156 | [package.extras] 157 | test = ["pytest (>=6)"] 158 | 159 | [[package]] 160 | name = "idna" 161 | version = "3.4" 162 | description = "Internationalized Domain Names in Applications (IDNA)" 163 | optional = false 164 | python-versions = ">=3.5" 165 | files = [ 166 | {file = "idna-3.4-py3-none-any.whl", hash = "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"}, 167 | {file = "idna-3.4.tar.gz", hash = "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4"}, 168 | ] 169 | 170 | [[package]] 171 | name = "imagesize" 172 | version = "1.4.1" 173 | description = "Getting image size from png/jpeg/jpeg2000/gif file" 174 | optional = false 175 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 176 | files = [ 177 | {file = "imagesize-1.4.1-py2.py3-none-any.whl", hash = "sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b"}, 178 | {file = "imagesize-1.4.1.tar.gz", hash = "sha256:69150444affb9cb0d5cc5a92b3676f0b2fb7cd9ae39e947a5e11a36b4497cd4a"}, 179 | ] 180 | 181 | [[package]] 182 | name = "importlib-metadata" 183 | version = "6.6.0" 184 | description = "Read metadata from Python packages" 185 | optional = false 186 | python-versions = ">=3.7" 187 | files = [ 188 | {file = "importlib_metadata-6.6.0-py3-none-any.whl", hash = "sha256:43dd286a2cd8995d5eaef7fee2066340423b818ed3fd70adf0bad5f1fac53fed"}, 189 | {file = "importlib_metadata-6.6.0.tar.gz", hash = "sha256:92501cdf9cc66ebd3e612f1b4f0c0765dfa42f0fa38ffb319b6bd84dd675d705"}, 190 | ] 191 | 192 | [package.dependencies] 193 | typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""} 194 | zipp = ">=0.5" 195 | 196 | [package.extras] 197 | docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] 198 | perf = ["ipython"] 199 | testing = ["flake8 
(<5)", "flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf (>=0.9.2)"] 200 | 201 | [[package]] 202 | name = "iniconfig" 203 | version = "2.0.0" 204 | description = "brain-dead simple config-ini parsing" 205 | optional = false 206 | python-versions = ">=3.7" 207 | files = [ 208 | {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, 209 | {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, 210 | ] 211 | 212 | [[package]] 213 | name = "jinja2" 214 | version = "3.1.2" 215 | description = "A very fast and expressive template engine." 216 | optional = false 217 | python-versions = ">=3.7" 218 | files = [ 219 | {file = "Jinja2-3.1.2-py3-none-any.whl", hash = "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61"}, 220 | {file = "Jinja2-3.1.2.tar.gz", hash = "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852"}, 221 | ] 222 | 223 | [package.dependencies] 224 | MarkupSafe = ">=2.0" 225 | 226 | [package.extras] 227 | i18n = ["Babel (>=2.7)"] 228 | 229 | [[package]] 230 | name = "markupsafe" 231 | version = "2.1.2" 232 | description = "Safely add untrusted strings to HTML/XML markup." 233 | optional = false 234 | python-versions = ">=3.7" 235 | files = [ 236 | {file = "MarkupSafe-2.1.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:665a36ae6f8f20a4676b53224e33d456a6f5a72657d9c83c2aa00765072f31f7"}, 237 | {file = "MarkupSafe-2.1.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:340bea174e9761308703ae988e982005aedf427de816d1afe98147668cc03036"}, 238 | {file = "MarkupSafe-2.1.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22152d00bf4a9c7c83960521fc558f55a1adbc0631fbb00a9471e097b19d72e1"}, 239 | {file = "MarkupSafe-2.1.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:28057e985dace2f478e042eaa15606c7efccb700797660629da387eb289b9323"}, 240 | {file = "MarkupSafe-2.1.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ca244fa73f50a800cf8c3ebf7fd93149ec37f5cb9596aa8873ae2c1d23498601"}, 241 | {file = "MarkupSafe-2.1.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d9d971ec1e79906046aa3ca266de79eac42f1dbf3612a05dc9368125952bd1a1"}, 242 | {file = "MarkupSafe-2.1.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7e007132af78ea9df29495dbf7b5824cb71648d7133cf7848a2a5dd00d36f9ff"}, 243 | {file = "MarkupSafe-2.1.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7313ce6a199651c4ed9d7e4cfb4aa56fe923b1adf9af3b420ee14e6d9a73df65"}, 244 | {file = "MarkupSafe-2.1.2-cp310-cp310-win32.whl", hash = "sha256:c4a549890a45f57f1ebf99c067a4ad0cb423a05544accaf2b065246827ed9603"}, 245 | {file = "MarkupSafe-2.1.2-cp310-cp310-win_amd64.whl", hash = "sha256:835fb5e38fd89328e9c81067fd642b3593c33e1e17e2fdbf77f5676abb14a156"}, 246 | {file = "MarkupSafe-2.1.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:2ec4f2d48ae59bbb9d1f9d7efb9236ab81429a764dedca114f5fdabbc3788013"}, 247 | {file = "MarkupSafe-2.1.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:608e7073dfa9e38a85d38474c082d4281f4ce276ac0010224eaba11e929dd53a"}, 248 | {file = "MarkupSafe-2.1.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:65608c35bfb8a76763f37036547f7adfd09270fbdbf96608be2bead319728fcd"}, 249 | {file = "MarkupSafe-2.1.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2bfb563d0211ce16b63c7cb9395d2c682a23187f54c3d79bfec33e6705473c6"}, 250 | {file = "MarkupSafe-2.1.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:da25303d91526aac3672ee6d49a2f3db2d9502a4a60b55519feb1a4c7714e07d"}, 251 | {file = "MarkupSafe-2.1.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:9cad97ab29dfc3f0249b483412c85c8ef4766d96cdf9dcf5a1e3caa3f3661cf1"}, 252 | {file = "MarkupSafe-2.1.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:085fd3201e7b12809f9e6e9bc1e5c96a368c8523fad5afb02afe3c051ae4afcc"}, 253 | {file = "MarkupSafe-2.1.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:1bea30e9bf331f3fef67e0a3877b2288593c98a21ccb2cf29b74c581a4eb3af0"}, 254 | {file = "MarkupSafe-2.1.2-cp311-cp311-win32.whl", hash = "sha256:7df70907e00c970c60b9ef2938d894a9381f38e6b9db73c5be35e59d92e06625"}, 255 | {file = "MarkupSafe-2.1.2-cp311-cp311-win_amd64.whl", hash = "sha256:e55e40ff0cc8cc5c07996915ad367fa47da6b3fc091fdadca7f5403239c5fec3"}, 256 | {file = "MarkupSafe-2.1.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:a6e40afa7f45939ca356f348c8e23048e02cb109ced1eb8420961b2f40fb373a"}, 257 | {file = "MarkupSafe-2.1.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cf877ab4ed6e302ec1d04952ca358b381a882fbd9d1b07cccbfd61783561f98a"}, 258 | {file = "MarkupSafe-2.1.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:63ba06c9941e46fa389d389644e2d8225e0e3e5ebcc4ff1ea8506dce646f8c8a"}, 259 | {file = "MarkupSafe-2.1.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f1cd098434e83e656abf198f103a8207a8187c0fc110306691a2e94a78d0abb2"}, 260 | {file = "MarkupSafe-2.1.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:55f44b440d491028addb3b88f72207d71eeebfb7b5dbf0643f7c023ae1fba619"}, 261 | {file = "MarkupSafe-2.1.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:a6f2fcca746e8d5910e18782f976489939d54a91f9411c32051b4aab2bd7c513"}, 262 | {file = "MarkupSafe-2.1.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:0b462104ba25f1ac006fdab8b6a01ebbfbce9ed37fd37fd4acd70c67c973e460"}, 263 | {file = "MarkupSafe-2.1.2-cp37-cp37m-win32.whl", hash = "sha256:7668b52e102d0ed87cb082380a7e2e1e78737ddecdde129acadb0eccc5423859"}, 264 | {file = "MarkupSafe-2.1.2-cp37-cp37m-win_amd64.whl", hash = "sha256:6d6607f98fcf17e534162f0709aaad3ab7a96032723d8ac8750ffe17ae5a0666"}, 265 | {file = "MarkupSafe-2.1.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:a806db027852538d2ad7555b203300173dd1b77ba116de92da9afbc3a3be3eed"}, 266 | {file = "MarkupSafe-2.1.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:a4abaec6ca3ad8660690236d11bfe28dfd707778e2442b45addd2f086d6ef094"}, 267 | {file = "MarkupSafe-2.1.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f03a532d7dee1bed20bc4884194a16160a2de9ffc6354b3878ec9682bb623c54"}, 268 | {file = "MarkupSafe-2.1.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4cf06cdc1dda95223e9d2d3c58d3b178aa5dacb35ee7e3bbac10e4e1faacb419"}, 269 | {file = "MarkupSafe-2.1.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:22731d79ed2eb25059ae3df1dfc9cb1546691cc41f4e3130fe6bfbc3ecbbecfa"}, 270 | {file = 
"MarkupSafe-2.1.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:f8ffb705ffcf5ddd0e80b65ddf7bed7ee4f5a441ea7d3419e861a12eaf41af58"}, 271 | {file = "MarkupSafe-2.1.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:8db032bf0ce9022a8e41a22598eefc802314e81b879ae093f36ce9ddf39ab1ba"}, 272 | {file = "MarkupSafe-2.1.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:2298c859cfc5463f1b64bd55cb3e602528db6fa0f3cfd568d3605c50678f8f03"}, 273 | {file = "MarkupSafe-2.1.2-cp38-cp38-win32.whl", hash = "sha256:50c42830a633fa0cf9e7d27664637532791bfc31c731a87b202d2d8ac40c3ea2"}, 274 | {file = "MarkupSafe-2.1.2-cp38-cp38-win_amd64.whl", hash = "sha256:bb06feb762bade6bf3c8b844462274db0c76acc95c52abe8dbed28ae3d44a147"}, 275 | {file = "MarkupSafe-2.1.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:99625a92da8229df6d44335e6fcc558a5037dd0a760e11d84be2260e6f37002f"}, 276 | {file = "MarkupSafe-2.1.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:8bca7e26c1dd751236cfb0c6c72d4ad61d986e9a41bbf76cb445f69488b2a2bd"}, 277 | {file = "MarkupSafe-2.1.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40627dcf047dadb22cd25ea7ecfe9cbf3bbbad0482ee5920b582f3809c97654f"}, 278 | {file = "MarkupSafe-2.1.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:40dfd3fefbef579ee058f139733ac336312663c6706d1163b82b3003fb1925c4"}, 279 | {file = "MarkupSafe-2.1.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:090376d812fb6ac5f171e5938e82e7f2d7adc2b629101cec0db8b267815c85e2"}, 280 | {file = "MarkupSafe-2.1.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:2e7821bffe00aa6bd07a23913b7f4e01328c3d5cc0b40b36c0bd81d362faeb65"}, 281 | {file = "MarkupSafe-2.1.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:c0a33bc9f02c2b17c3ea382f91b4db0e6cde90b63b296422a939886a7a80de1c"}, 282 | {file = "MarkupSafe-2.1.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:b8526c6d437855442cdd3d87eede9c425c4445ea011ca38d937db299382e6fa3"}, 283 | {file = "MarkupSafe-2.1.2-cp39-cp39-win32.whl", hash = "sha256:137678c63c977754abe9086a3ec011e8fd985ab90631145dfb9294ad09c102a7"}, 284 | {file = "MarkupSafe-2.1.2-cp39-cp39-win_amd64.whl", hash = "sha256:0576fe974b40a400449768941d5d0858cc624e3249dfd1e0c33674e5c7ca7aed"}, 285 | {file = "MarkupSafe-2.1.2.tar.gz", hash = "sha256:abcabc8c2b26036d62d4c746381a6f7cf60aafcc653198ad678306986b09450d"}, 286 | ] 287 | 288 | [[package]] 289 | name = "packaging" 290 | version = "23.1" 291 | description = "Core utilities for Python packages" 292 | optional = false 293 | python-versions = ">=3.7" 294 | files = [ 295 | {file = "packaging-23.1-py3-none-any.whl", hash = "sha256:994793af429502c4ea2ebf6bf664629d07c1a9fe974af92966e4b8d2df7edc61"}, 296 | {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"}, 297 | ] 298 | 299 | [[package]] 300 | name = "pluggy" 301 | version = "1.0.0" 302 | description = "plugin and hook calling mechanisms for python" 303 | optional = false 304 | python-versions = ">=3.6" 305 | files = [ 306 | {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, 307 | {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, 308 | ] 309 | 310 | [package.dependencies] 311 | importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} 312 | 313 | [package.extras] 314 | dev = ["pre-commit", 
"tox"] 315 | testing = ["pytest", "pytest-benchmark"] 316 | 317 | [[package]] 318 | name = "pygments" 319 | version = "2.15.1" 320 | description = "Pygments is a syntax highlighting package written in Python." 321 | optional = false 322 | python-versions = ">=3.7" 323 | files = [ 324 | {file = "Pygments-2.15.1-py3-none-any.whl", hash = "sha256:db2db3deb4b4179f399a09054b023b6a586b76499d36965813c71aa8ed7b5fd1"}, 325 | {file = "Pygments-2.15.1.tar.gz", hash = "sha256:8ace4d3c1dd481894b2005f560ead0f9f19ee64fe983366be1a21e171d12775c"}, 326 | ] 327 | 328 | [package.extras] 329 | plugins = ["importlib-metadata"] 330 | 331 | [[package]] 332 | name = "pytest" 333 | version = "7.3.1" 334 | description = "pytest: simple powerful testing with Python" 335 | optional = false 336 | python-versions = ">=3.7" 337 | files = [ 338 | {file = "pytest-7.3.1-py3-none-any.whl", hash = "sha256:3799fa815351fea3a5e96ac7e503a96fa51cc9942c3753cda7651b93c1cfa362"}, 339 | {file = "pytest-7.3.1.tar.gz", hash = "sha256:434afafd78b1d78ed0addf160ad2b77a30d35d4bdf8af234fe621919d9ed15e3"}, 340 | ] 341 | 342 | [package.dependencies] 343 | colorama = {version = "*", markers = "sys_platform == \"win32\""} 344 | exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} 345 | importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} 346 | iniconfig = "*" 347 | packaging = "*" 348 | pluggy = ">=0.12,<2.0" 349 | tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} 350 | 351 | [package.extras] 352 | testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] 353 | 354 | [[package]] 355 | name = "pytz" 356 | version = "2023.3" 357 | description = "World timezone definitions, modern and historical" 358 | optional = false 359 | python-versions = "*" 360 | files = [ 361 | {file = "pytz-2023.3-py2.py3-none-any.whl", hash = "sha256:a151b3abb88eda1d4e34a9814df37de2a80e301e68ba0fd856fb9b46bfbbbffb"}, 362 | {file = "pytz-2023.3.tar.gz", hash = "sha256:1d8ce29db189191fb55338ee6d0387d82ab59f3d00eac103412d64e0ebd0c588"}, 363 | ] 364 | 365 | [[package]] 366 | name = "pyyaml" 367 | version = "6.0" 368 | description = "YAML parser and emitter for Python" 369 | optional = false 370 | python-versions = ">=3.6" 371 | files = [ 372 | {file = "PyYAML-6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53"}, 373 | {file = "PyYAML-6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c"}, 374 | {file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77f396e6ef4c73fdc33a9157446466f1cff553d979bd00ecb64385760c6babdc"}, 375 | {file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a80a78046a72361de73f8f395f1f1e49f956c6be882eed58505a15f3e430962b"}, 376 | {file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"}, 377 | {file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"}, 378 | {file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"}, 379 | {file = "PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358"}, 380 | {file = "PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1"}, 381 | {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d"}, 382 | {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f"}, 383 | {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782"}, 384 | {file = "PyYAML-6.0-cp311-cp311-win32.whl", hash = "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7"}, 385 | {file = "PyYAML-6.0-cp311-cp311-win_amd64.whl", hash = "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf"}, 386 | {file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"}, 387 | {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"}, 388 | {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"}, 389 | {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:98c4d36e99714e55cfbaaee6dd5badbc9a1ec339ebfc3b1f52e293aee6bb71a4"}, 390 | {file = "PyYAML-6.0-cp36-cp36m-win32.whl", hash = "sha256:0283c35a6a9fbf047493e3a0ce8d79ef5030852c51e9d911a27badfde0605293"}, 391 | {file = "PyYAML-6.0-cp36-cp36m-win_amd64.whl", hash = "sha256:07751360502caac1c067a8132d150cf3d61339af5691fe9e87803040dbc5db57"}, 392 | {file = "PyYAML-6.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:819b3830a1543db06c4d4b865e70ded25be52a2e0631ccd2f6a47a2822f2fd7c"}, 393 | {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:473f9edb243cb1935ab5a084eb238d842fb8f404ed2193a915d1784b5a6b5fc0"}, 394 | {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0ce82d761c532fe4ec3f87fc45688bdd3a4c1dc5e0b4a19814b9009a29baefd4"}, 395 | {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:231710d57adfd809ef5d34183b8ed1eeae3f76459c18fb4a0b373ad56bedcdd9"}, 396 | {file = "PyYAML-6.0-cp37-cp37m-win32.whl", hash = "sha256:c5687b8d43cf58545ade1fe3e055f70eac7a5a1a0bf42824308d868289a95737"}, 397 | {file = "PyYAML-6.0-cp37-cp37m-win_amd64.whl", hash = "sha256:d15a181d1ecd0d4270dc32edb46f7cb7733c7c508857278d3d378d14d606db2d"}, 398 | {file = "PyYAML-6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0b4624f379dab24d3725ffde76559cff63d9ec94e1736b556dacdfebe5ab6d4b"}, 399 | {file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:213c60cd50106436cc818accf5baa1aba61c0189ff610f64f4a3e8c6726218ba"}, 400 | {file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9fa600030013c4de8165339db93d182b9431076eb98eb40ee068700c9c813e34"}, 401 | {file = "PyYAML-6.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = 
"sha256:277a0ef2981ca40581a47093e9e2d13b3f1fbbeffae064c1d21bfceba2030287"}, 402 | {file = "PyYAML-6.0-cp38-cp38-win32.whl", hash = "sha256:d4eccecf9adf6fbcc6861a38015c2a64f38b9d94838ac1810a9023a0609e1b78"}, 403 | {file = "PyYAML-6.0-cp38-cp38-win_amd64.whl", hash = "sha256:1e4747bc279b4f613a09eb64bba2ba602d8a6664c6ce6396a4d0cd413a50ce07"}, 404 | {file = "PyYAML-6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:055d937d65826939cb044fc8c9b08889e8c743fdc6a32b33e2390f66013e449b"}, 405 | {file = "PyYAML-6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e61ceaab6f49fb8bdfaa0f92c4b57bcfbea54c09277b1b4f7ac376bfb7a7c174"}, 406 | {file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d67d839ede4ed1b28a4e8909735fc992a923cdb84e618544973d7dfc71540803"}, 407 | {file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cba8c411ef271aa037d7357a2bc8f9ee8b58b9965831d9e51baf703280dc73d3"}, 408 | {file = "PyYAML-6.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:40527857252b61eacd1d9af500c3337ba8deb8fc298940291486c465c8b46ec0"}, 409 | {file = "PyYAML-6.0-cp39-cp39-win32.whl", hash = "sha256:b5b9eccad747aabaaffbc6064800670f0c297e52c12754eb1d976c57e4f74dcb"}, 410 | {file = "PyYAML-6.0-cp39-cp39-win_amd64.whl", hash = "sha256:b3d267842bf12586ba6c734f89d1f5b871df0273157918b0ccefa29deb05c21c"}, 411 | {file = "PyYAML-6.0.tar.gz", hash = "sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2"}, 412 | ] 413 | 414 | [[package]] 415 | name = "requests" 416 | version = "2.31.0" 417 | description = "Python HTTP for Humans." 418 | optional = false 419 | python-versions = ">=3.7" 420 | files = [ 421 | {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"}, 422 | {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"}, 423 | ] 424 | 425 | [package.dependencies] 426 | certifi = ">=2017.4.17" 427 | charset-normalizer = ">=2,<4" 428 | idna = ">=2.5,<4" 429 | urllib3 = ">=1.21.1,<3" 430 | 431 | [package.extras] 432 | socks = ["PySocks (>=1.5.6,!=1.5.7)"] 433 | use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] 434 | 435 | [[package]] 436 | name = "snowballstemmer" 437 | version = "2.2.0" 438 | description = "This package provides 29 stemmers for 28 languages generated from Snowball algorithms." 
439 | optional = false 440 | python-versions = "*" 441 | files = [ 442 | {file = "snowballstemmer-2.2.0-py2.py3-none-any.whl", hash = "sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a"}, 443 | {file = "snowballstemmer-2.2.0.tar.gz", hash = "sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1"}, 444 | ] 445 | 446 | [[package]] 447 | name = "sphinx" 448 | version = "6.2.1" 449 | description = "Python documentation generator" 450 | optional = false 451 | python-versions = ">=3.8" 452 | files = [ 453 | {file = "Sphinx-6.2.1.tar.gz", hash = "sha256:6d56a34697bb749ffa0152feafc4b19836c755d90a7c59b72bc7dfd371b9cc6b"}, 454 | {file = "sphinx-6.2.1-py3-none-any.whl", hash = "sha256:97787ff1fa3256a3eef9eda523a63dbf299f7b47e053cfcf684a1c2a8380c912"}, 455 | ] 456 | 457 | [package.dependencies] 458 | alabaster = ">=0.7,<0.8" 459 | babel = ">=2.9" 460 | colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""} 461 | docutils = ">=0.18.1,<0.20" 462 | imagesize = ">=1.3" 463 | importlib-metadata = {version = ">=4.8", markers = "python_version < \"3.10\""} 464 | Jinja2 = ">=3.0" 465 | packaging = ">=21.0" 466 | Pygments = ">=2.13" 467 | requests = ">=2.25.0" 468 | snowballstemmer = ">=2.0" 469 | sphinxcontrib-applehelp = "*" 470 | sphinxcontrib-devhelp = "*" 471 | sphinxcontrib-htmlhelp = ">=2.0.0" 472 | sphinxcontrib-jsmath = "*" 473 | sphinxcontrib-qthelp = "*" 474 | sphinxcontrib-serializinghtml = ">=1.1.5" 475 | 476 | [package.extras] 477 | docs = ["sphinxcontrib-websupport"] 478 | lint = ["docutils-stubs", "flake8 (>=3.5.0)", "flake8-simplify", "isort", "mypy (>=0.990)", "ruff", "sphinx-lint", "types-requests"] 479 | test = ["cython", "filelock", "html5lib", "pytest (>=4.6)"] 480 | 481 | [[package]] 482 | name = "sphinx-rtd-theme" 483 | version = "1.2.1" 484 | description = "Read the Docs theme for Sphinx" 485 | optional = false 486 | python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" 487 | files = [ 488 | {file = "sphinx_rtd_theme-1.2.1-py2.py3-none-any.whl", hash = "sha256:2cc9351176cbf91944ce44cefd4fab6c3b76ac53aa9e15d6db45a3229ad7f866"}, 489 | {file = "sphinx_rtd_theme-1.2.1.tar.gz", hash = "sha256:cf9a7dc0352cf179c538891cb28d6fad6391117d4e21c891776ab41dd6c8ff70"}, 490 | ] 491 | 492 | [package.dependencies] 493 | docutils = "<0.19" 494 | sphinx = ">=1.6,<7" 495 | sphinxcontrib-jquery = {version = ">=2.0.0,<3.0.0 || >3.0.0", markers = "python_version > \"3\""} 496 | 497 | [package.extras] 498 | dev = ["bump2version", "sphinxcontrib-httpdomain", "transifex-client", "wheel"] 499 | 500 | [[package]] 501 | name = "sphinxcontrib-applehelp" 502 | version = "1.0.4" 503 | description = "sphinxcontrib-applehelp is a Sphinx extension which outputs Apple help books" 504 | optional = false 505 | python-versions = ">=3.8" 506 | files = [ 507 | {file = "sphinxcontrib-applehelp-1.0.4.tar.gz", hash = "sha256:828f867945bbe39817c210a1abfd1bc4895c8b73fcaade56d45357a348a07d7e"}, 508 | {file = "sphinxcontrib_applehelp-1.0.4-py3-none-any.whl", hash = "sha256:29d341f67fb0f6f586b23ad80e072c8e6ad0b48417db2bde114a4c9746feb228"}, 509 | ] 510 | 511 | [package.extras] 512 | lint = ["docutils-stubs", "flake8", "mypy"] 513 | test = ["pytest"] 514 | 515 | [[package]] 516 | name = "sphinxcontrib-devhelp" 517 | version = "1.0.2" 518 | description = "sphinxcontrib-devhelp is a sphinx extension which outputs Devhelp document." 
519 | optional = false 520 | python-versions = ">=3.5" 521 | files = [ 522 | {file = "sphinxcontrib-devhelp-1.0.2.tar.gz", hash = "sha256:ff7f1afa7b9642e7060379360a67e9c41e8f3121f2ce9164266f61b9f4b338e4"}, 523 | {file = "sphinxcontrib_devhelp-1.0.2-py2.py3-none-any.whl", hash = "sha256:8165223f9a335cc1af7ffe1ed31d2871f325254c0423bc0c4c7cd1c1e4734a2e"}, 524 | ] 525 | 526 | [package.extras] 527 | lint = ["docutils-stubs", "flake8", "mypy"] 528 | test = ["pytest"] 529 | 530 | [[package]] 531 | name = "sphinxcontrib-htmlhelp" 532 | version = "2.0.1" 533 | description = "sphinxcontrib-htmlhelp is a sphinx extension which renders HTML help files" 534 | optional = false 535 | python-versions = ">=3.8" 536 | files = [ 537 | {file = "sphinxcontrib-htmlhelp-2.0.1.tar.gz", hash = "sha256:0cbdd302815330058422b98a113195c9249825d681e18f11e8b1f78a2f11efff"}, 538 | {file = "sphinxcontrib_htmlhelp-2.0.1-py3-none-any.whl", hash = "sha256:c38cb46dccf316c79de6e5515e1770414b797162b23cd3d06e67020e1d2a6903"}, 539 | ] 540 | 541 | [package.extras] 542 | lint = ["docutils-stubs", "flake8", "mypy"] 543 | test = ["html5lib", "pytest"] 544 | 545 | [[package]] 546 | name = "sphinxcontrib-jquery" 547 | version = "4.1" 548 | description = "Extension to include jQuery on newer Sphinx releases" 549 | optional = false 550 | python-versions = ">=2.7" 551 | files = [ 552 | {file = "sphinxcontrib-jquery-4.1.tar.gz", hash = "sha256:1620739f04e36a2c779f1a131a2dfd49b2fd07351bf1968ced074365933abc7a"}, 553 | {file = "sphinxcontrib_jquery-4.1-py2.py3-none-any.whl", hash = "sha256:f936030d7d0147dd026a4f2b5a57343d233f1fc7b363f68b3d4f1cb0993878ae"}, 554 | ] 555 | 556 | [package.dependencies] 557 | Sphinx = ">=1.8" 558 | 559 | [[package]] 560 | name = "sphinxcontrib-jsmath" 561 | version = "1.0.1" 562 | description = "A sphinx extension which renders display math in HTML via JavaScript" 563 | optional = false 564 | python-versions = ">=3.5" 565 | files = [ 566 | {file = "sphinxcontrib-jsmath-1.0.1.tar.gz", hash = "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8"}, 567 | {file = "sphinxcontrib_jsmath-1.0.1-py2.py3-none-any.whl", hash = "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178"}, 568 | ] 569 | 570 | [package.extras] 571 | test = ["flake8", "mypy", "pytest"] 572 | 573 | [[package]] 574 | name = "sphinxcontrib-qthelp" 575 | version = "1.0.3" 576 | description = "sphinxcontrib-qthelp is a sphinx extension which outputs QtHelp document." 577 | optional = false 578 | python-versions = ">=3.5" 579 | files = [ 580 | {file = "sphinxcontrib-qthelp-1.0.3.tar.gz", hash = "sha256:4c33767ee058b70dba89a6fc5c1892c0d57a54be67ddd3e7875a18d14cba5a72"}, 581 | {file = "sphinxcontrib_qthelp-1.0.3-py2.py3-none-any.whl", hash = "sha256:bd9fc24bcb748a8d51fd4ecaade681350aa63009a347a8c14e637895444dfab6"}, 582 | ] 583 | 584 | [package.extras] 585 | lint = ["docutils-stubs", "flake8", "mypy"] 586 | test = ["pytest"] 587 | 588 | [[package]] 589 | name = "sphinxcontrib-serializinghtml" 590 | version = "1.1.5" 591 | description = "sphinxcontrib-serializinghtml is a sphinx extension which outputs \"serialized\" HTML files (json and pickle)." 
592 | optional = false 593 | python-versions = ">=3.5" 594 | files = [ 595 | {file = "sphinxcontrib-serializinghtml-1.1.5.tar.gz", hash = "sha256:aa5f6de5dfdf809ef505c4895e51ef5c9eac17d0f287933eb49ec495280b6952"}, 596 | {file = "sphinxcontrib_serializinghtml-1.1.5-py2.py3-none-any.whl", hash = "sha256:352a9a00ae864471d3a7ead8d7d79f5fc0b57e8b3f95e9867eb9eb28999b92fd"}, 597 | ] 598 | 599 | [package.extras] 600 | lint = ["docutils-stubs", "flake8", "mypy"] 601 | test = ["pytest"] 602 | 603 | [[package]] 604 | name = "tomli" 605 | version = "2.0.1" 606 | description = "A lil' TOML parser" 607 | optional = false 608 | python-versions = ">=3.7" 609 | files = [ 610 | {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, 611 | {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, 612 | ] 613 | 614 | [[package]] 615 | name = "typing-extensions" 616 | version = "4.6.1" 617 | description = "Backported and Experimental Type Hints for Python 3.7+" 618 | optional = false 619 | python-versions = ">=3.7" 620 | files = [ 621 | {file = "typing_extensions-4.6.1-py3-none-any.whl", hash = "sha256:6bac751f4789b135c43228e72de18637e9a6c29d12777023a703fd1a6858469f"}, 622 | {file = "typing_extensions-4.6.1.tar.gz", hash = "sha256:558bc0c4145f01e6405f4a5fdbd82050bd221b119f4bf72a961a1cfd471349d6"}, 623 | ] 624 | 625 | [[package]] 626 | name = "urllib3" 627 | version = "2.0.2" 628 | description = "HTTP library with thread-safe connection pooling, file post, and more." 629 | optional = false 630 | python-versions = ">=3.7" 631 | files = [ 632 | {file = "urllib3-2.0.2-py3-none-any.whl", hash = "sha256:d055c2f9d38dc53c808f6fdc8eab7360b6fdbbde02340ed25cfbcd817c62469e"}, 633 | {file = "urllib3-2.0.2.tar.gz", hash = "sha256:61717a1095d7e155cdb737ac7bb2f4324a858a1e2e6466f6d03ff630ca68d3cc"}, 634 | ] 635 | 636 | [package.extras] 637 | brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] 638 | secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17.1.0)", "urllib3-secure-extra"] 639 | socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] 640 | zstd = ["zstandard (>=0.18.0)"] 641 | 642 | [[package]] 643 | name = "zipp" 644 | version = "3.15.0" 645 | description = "Backport of pathlib-compatible object wrapper for zip files" 646 | optional = false 647 | python-versions = ">=3.7" 648 | files = [ 649 | {file = "zipp-3.15.0-py3-none-any.whl", hash = "sha256:48904fc76a60e542af151aded95726c1a5c34ed43ab4134b597665c86d7ad556"}, 650 | {file = "zipp-3.15.0.tar.gz", hash = "sha256:112929ad649da941c23de50f356a2b5570c954b65150642bccdd66bf194d224b"}, 651 | ] 652 | 653 | [package.extras] 654 | docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] 655 | testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] 656 | 657 | [metadata] 658 | lock-version = "2.0" 659 | python-versions = "^3.7" 660 | content-hash = "82060e113dafb63ffa0ca0a9ea5b6070c9771ceb1dd3727757730f14aaaab96b" 661 | -------------------------------------------------------------------------------- /tests/cases/extract.yml: -------------------------------------------------------------------------------- 1 | tests: 2 | mentions: 3 | - description: "Extract mention at the begining of a 
tweet" 4 | text: "@username reply" 5 | expected: ["username"] 6 | 7 | - description: "Extract mention at the end of a tweet" 8 | text: "mention @username" 9 | expected: ["username"] 10 | 11 | - description: "Extract mention in the middle of a tweet" 12 | text: "mention @username in the middle" 13 | expected: ["username"] 14 | 15 | - description: "Extract mention of username with underscore" 16 | text: "mention @user_name" 17 | expected: ["user_name"] 18 | 19 | - description: "Extract mention of all numeric username" 20 | text: "mention @12345" 21 | expected: ["12345"] 22 | 23 | - description: "Extract mention or multiple usernames" 24 | text: "mention @username1 @username2" 25 | expected: ["username1", "username2"] 26 | 27 | - description: "Extract mention in the middle of a Japanese tweet" 28 | text: "の@usernameに到着を待っている" 29 | expected: ["username"] 30 | 31 | - description: "DO NOT extract username ending in @" 32 | text: "Current Status: @_@ (cc: @username)" 33 | expected: ["username"] 34 | 35 | - description: "DO NOT extract username followed by accented latin characters" 36 | text: "@aliceìnheiro something something" 37 | expected: [] 38 | 39 | - description: "Extract lone metion but not @user@user (too close to an email)" 40 | text: "@username email me @test@example.com" 41 | expected: ["username"] 42 | 43 | - description: "DO NOT extract 'http' in '@http://' as username" 44 | text: "@http://twitter.com" 45 | expected: [] 46 | 47 | - description: "Extract mentions before newline" 48 | text: "@username\n@mention" 49 | expected: ["username", "mention"] 50 | 51 | - description: "Extract mentions after 'RT'" 52 | text: "RT@username RT:@mention RT @test" 53 | expected: ["username", "mention", "test"] 54 | 55 | - description: "Extract mentions after 'rt'" 56 | text: "rt@username rt:@mention rt @test" 57 | expected: ["username", "mention", "test"] 58 | 59 | - description: "Extract mentions after 'Rt'" 60 | text: "Rt@username Rt:@mention Rt @test" 61 | expected: ["username", "mention", "test"] 62 | 63 | - description: "Extract mentions after 'rT'" 64 | text: "rT@username rT:@mention rT @test" 65 | expected: ["username", "mention", "test"] 66 | 67 | - description: "DO NOT extract username preceded by !" 68 | text: "f!@kn" 69 | expected: [] 70 | 71 | - description: "DO NOT extract username preceded by @" 72 | text: "f@@kn" 73 | expected: [] 74 | 75 | - description: "DO NOT extract username preceded by #" 76 | text: "f#@kn" 77 | expected: [] 78 | 79 | - description: "DO NOT extract username preceded by $" 80 | text: "f$@kn" 81 | expected: [] 82 | 83 | - description: "DO NOT extract username preceded by %" 84 | text: "f%@kn" 85 | expected: [] 86 | 87 | - description: "DO NOT extract username preceded by &" 88 | text: "f&@kn" 89 | expected: [] 90 | 91 | - description: "DO NOT extract username preceded by *" 92 | text: "f*@kn" 93 | expected: [] 94 | 95 | mentions_with_indices: 96 | - description: "Extract a mention at the start" 97 | text: "@username yo!" 
98 | expected: 99 | - screen_name: "username" 100 | indices: [0, 9] 101 | 102 | - description: "Extract a mention that has the same thing mentioned at the start" 103 | text: "username @username" 104 | expected: 105 | - screen_name: "username" 106 | indices: [9, 18] 107 | 108 | - description: "Extract a mention in the middle of a Japanese tweet" 109 | text: "の@usernameに到着を待っている" 110 | expected: 111 | - screen_name: "username" 112 | indices: [1, 10] 113 | 114 | mentions_or_lists_with_indices: 115 | - description: "Extract a mention" 116 | text: "@username yo!" 117 | expected: 118 | - screen_name: "username" 119 | list_slug: "" 120 | indices: [0, 9] 121 | 122 | - description: "Extract a list" 123 | text: "@username/list-name is a great list!" 124 | expected: 125 | - screen_name: "username" 126 | list_slug: "/list-name" 127 | indices: [0, 19] 128 | 129 | - description: "Extract a mention and list" 130 | text: "Hey @username, check out out @otheruser/list_name-01!" 131 | expected: 132 | - screen_name: "username" 133 | list_slug: "" 134 | indices: [4, 13] 135 | - screen_name: "otheruser" 136 | list_slug: "/list_name-01" 137 | indices: [29, 52] 138 | 139 | - description: "Extract a list in the middle of a Japanese tweet" 140 | text: "の@username/list_name-01に到着を待っている" 141 | expected: 142 | - screen_name: "username" 143 | list_slug: "/list_name-01" 144 | indices: [1, 23] 145 | 146 | - description: "DO NOT extract a list with slug that starts with a number" 147 | text: "@username/7list-name is a great list!" 148 | expected: 149 | - screen_name: "username" 150 | list_slug: "" 151 | indices: [0, 9] 152 | 153 | replies: 154 | - description: "Extract reply at the begining of a tweet" 155 | text: "@username reply" 156 | expected: "username" 157 | 158 | - description: "Extract reply preceded by only a space" 159 | text: " @username reply" 160 | expected: "username" 161 | 162 | - description: "Extract reply preceded by only a full-width space (U+3000)" 163 | text: " @username reply" 164 | expected: "username" 165 | 166 | - description: "DO NOT Extract reply when preceded by text" 167 | text: "a @username mention, not a reply" 168 | expected: 169 | 170 | - description: "DO NOT Extract reply when preceded by ." 171 | text: ".@username mention, not a reply" 172 | expected: 173 | 174 | - description: "DO NOT Extract reply when preceded by /" 175 | text: "/@username mention, not a reply" 176 | expected: 177 | 178 | - description: "DO NOT Extract reply when preceded by _" 179 | text: "_@username mention, not a reply" 180 | expected: 181 | 182 | - description: "DO NOT Extract reply when preceded by -" 183 | text: "-@username mention, not a reply" 184 | expected: 185 | 186 | - description: "DO NOT Extract reply when preceded by +" 187 | text: "+@username mention, not a reply" 188 | expected: 189 | 190 | - description: "DO NOT Extract reply when preceded by #" 191 | text: "#@username mention, not a reply" 192 | expected: 193 | 194 | - description: "DO NOT Extract reply when preceded by !" 
195 | text: "!@username mention, not a reply" 196 | expected: 197 | 198 | - description: "DO NOT Extract reply when preceded by @" 199 | text: "@@username mention, not a reply" 200 | expected: 201 | 202 | - description: "DO NOT Extract reply when followed by URL" 203 | text: "@http://twitter.com" 204 | expected: 205 | 206 | urls: 207 | - description: "Extract a lone URL" 208 | text: "http://example.com" 209 | expected: ["http://example.com"] 210 | 211 | - description: "Extract a lone unicode url" 212 | text: "http://ああ.com" 213 | expected: ["http://ああ.com"] 214 | 215 | - description: "Extract a lone unicode url with -" 216 | text: "http://あ-あ.com" 217 | expected: ["http://あ-あ.com"] 218 | 219 | - description: "Extract valid URL: http://google.com" 220 | text: "text http://google.com" 221 | expected: ["http://google.com"] 222 | 223 | - description: "Extract valid URL: http://foobar.com/#" 224 | text: "text http://foobar.com/#" 225 | expected: ["http://foobar.com/#"] 226 | 227 | - description: "Extract valid URL: http://google.com/#foo" 228 | text: "text http://google.com/#foo" 229 | expected: ["http://google.com/#foo"] 230 | 231 | - description: "Extract valid URL: http://google.com/#search?q=iphone%20-filter%3Alinks" 232 | text: "text http://google.com/#search?q=iphone%20-filter%3Alinks" 233 | expected: ["http://google.com/#search?q=iphone%20-filter%3Alinks"] 234 | 235 | - description: "Extract valid URL: http://twitter.com/#search?q=iphone%20-filter%3Alinks" 236 | text: "text http://twitter.com/#search?q=iphone%20-filter%3Alinks" 237 | expected: ["http://twitter.com/#search?q=iphone%20-filter%3Alinks"] 238 | 239 | - description: "Extract valid URL: http://somedomain.com/index.php?path=/abc/def/" 240 | text: "text http://somedomain.com/index.php?path=/abc/def/" 241 | expected: ["http://somedomain.com/index.php?path=/abc/def/"] 242 | 243 | - description: "Extract valid URL: http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html" 244 | text: "text http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html" 245 | expected: ["http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html"] 246 | 247 | - description: "Extract valid URL: http://somehost.com:3000" 248 | text: "text http://somehost.com:3000" 249 | expected: ["http://somehost.com:3000"] 250 | 251 | - description: "Extract valid URL: http://xo.com/~matthew+%ff-x" 252 | text: "text http://xo.com/~matthew+%ff-x" 253 | expected: ["http://xo.com/~matthew+%ff-x"] 254 | 255 | - description: "Extract valid URL: http://xo.com/~matthew+%ff-,.;x" 256 | text: "text http://xo.com/~matthew+%ff-,.;x" 257 | expected: ["http://xo.com/~matthew+%ff-,.;x"] 258 | 259 | - description: "Extract valid URL: http://xo.com/,.;x" 260 | text: "text http://xo.com/,.;x" 261 | expected: ["http://xo.com/,.;x"] 262 | 263 | - description: "Extract valid URL: http://en.wikipedia.org/wiki/Primer_(film)" 264 | text: "text http://en.wikipedia.org/wiki/Primer_(film)" 265 | expected: ["http://en.wikipedia.org/wiki/Primer_(film)"] 266 | 267 | - description: "Extract valid URL: http://www.ams.org/bookstore-getitem/item=mbk-59" 268 | text: "text http://www.ams.org/bookstore-getitem/item=mbk-59" 269 | expected: ["http://www.ams.org/bookstore-getitem/item=mbk-59"] 270 | 271 | - description: "Extract valid URL: http://✪df.ws/ejp" 272 | text: "text http://✪df.ws/ejp" 273 | expected: ["http://✪df.ws/ejp"] 274 | 275 | - description: "Extract valid URL: http://example.com/" 276 | text: "test http://example.comだよね.comtest/hogehoge" 277 | expected: ["http://example.com"] 
278 | 279 | - description: "Extract valid URL: http://chilp.it/?77e8fd" 280 | text: "text http://chilp.it/?77e8fd" 281 | expected: ["http://chilp.it/?77e8fd"] 282 | 283 | - description: "Extract valid URL: http://x.com/oneletterdomain" 284 | text: "text http://x.com/oneletterdomain" 285 | expected: ["http://x.com/oneletterdomain"] 286 | 287 | - description: "Extract valid URL: http://msdn.microsoft.com/ja-jp/library/system.net.httpwebrequest(v=VS.100).aspx" 288 | text: "text http://msdn.microsoft.com/ja-jp/library/system.net.httpwebrequest(v=VS.100).aspx" 289 | expected: ["http://msdn.microsoft.com/ja-jp/library/system.net.httpwebrequest(v=VS.100).aspx"] 290 | 291 | - description: "Extract valid URL with hyphen as query ending char: https://www.youtube.com/watch?v=LOxOAuDHzaw&list=PLPoq910Q9jXhuH6pit_KwIsck9fEz_9U-" 292 | text: "text https://www.youtube.com/watch?v=LOxOAuDHzaw&list=PLPoq910Q9jXhuH6pit_KwIsck9fEz_9U-" 293 | expected: ["https://www.youtube.com/watch?v=LOxOAuDHzaw&list=PLPoq910Q9jXhuH6pit_KwIsck9fEz_9U-"] 294 | 295 | - description: "DO NOT extract invalid URL: http://no-tld" 296 | text: "text http://no-tld" 297 | expected: [] 298 | 299 | - description: "DO NOT extract invalid URL: http://tld-too-short.x" 300 | text: "text http://tld-too-short.x" 301 | expected: [] 302 | 303 | - description: "DO NOT extract invalid URL with invalid preceding character: (http://twitter.com" 304 | text: "(http://twitter.com" 305 | expected: ["http://twitter.com"] 306 | 307 | - description: "Extract a very long hyphenated sub-domain URL (single letter hyphens)" 308 | text: "text http://word-and-a-number-8-ftw.domain.com/" 309 | expected: ["http://word-and-a-number-8-ftw.domain.com/"] 310 | 311 | - description: "DO NOT Extract a hyphenated TLD (even though it's usually a typo)" 312 | text: "text http://domain.com-that-you-should-have-put-a-space-after" 313 | expected: [] 314 | 315 | - description: "Extract URL ending with # value" 316 | text: "text http://foo.com?#foo text" 317 | expected: ["http://foo.com?#foo"] 318 | 319 | - description: "Extract URLs without protocol on (com|org|edu|gov|net) domains" 320 | text: "foo.com foo.net foo.org foo.edu foo.gov" 321 | expected: ["foo.com", "foo.net", "foo.org", "foo.edu", "foo.gov"] 322 | 323 | - description: "Extract URLs without protocol not on (com|org|edu|gov|net) domains" 324 | text: "foo.baz foo.co.jp www.xxxxxxx.baz www.foo.co.uk wwwww.xxxxxxx foo.comm foo.somecom foo.govedu foo.jp" 325 | expected: ["foo.co.jp", "www.foo.co.uk", "foo.jp"] 326 | 327 | - description: "Extract URLs without protocol on ccTLD with slash" 328 | text: "t.co/abcde bit.ly/abcde" 329 | expected: ["t.co/abcde", "bit.ly/abcde"] 330 | 331 | - description: "Extract URLs with protocol on ccTLD domains" 332 | text: "http://foo.jp http://fooooo.jp" 333 | expected: ["http://foo.jp", "http://fooooo.jp"] 334 | 335 | - description: "Extract URLs with a - or + at the end of the path" 336 | text: "Go to http://example.com/a+ or http://example.com/a-" 337 | expected: ["http://example.com/a+", "http://example.com/a-"] 338 | 339 | - description: "Extract URLs with longer paths ending in -" 340 | text: "Go to http://example.com/view/slug-url-?foo=bar" 341 | expected: ["http://example.com/view/slug-url-?foo=bar"] 342 | 343 | - description: "Extract URLs with an en dash in the path" 344 | text: "Go to https://en.m.wikipedia.org/wiki/Hatfield–McCoy_feud please" 345 | expected: ["https://en.m.wikipedia.org/wiki/Hatfield–McCoy_feud"] 346 | 347 | - description: "Extract URLs beginning with a 
space" 348 | text: "@user Try http:// example.com/path" 349 | expected: ["example.com/path"] 350 | 351 | - description: "Extract long URL without protocol surrounded by CJK characters" 352 | text: "これは日本語です。example.com/path/index.html中国語example.com/path한국" 353 | expected: ["example.com/path/index.html", "example.com/path"] 354 | 355 | - description: "Extract short URL without protocol surrounded by CJK characters" 356 | text: "twitter.comこれは日本語です。example.com中国語t.co/abcde한국twitter.com example2.comテストtwitter.com/abcde" 357 | expected: ["twitter.com", "example.com", "t.co/abcde", "twitter.com", "example2.com", "twitter.com/abcde"] 358 | 359 | - description: "Extract URLs with and without protocol surrounded by CJK characters" 360 | text: "http://twitter.com/これは日本語です。example.com中国語http://t.co/abcde한국twitter.comテストexample2.comテストhttp://twitter.com/abcde" 361 | expected: ["http://twitter.com/", "example.com", "http://t.co/abcde", "twitter.com", "example2.com", "http://twitter.com/abcde"] 362 | 363 | - description: "Extract URLs with protocol and path containing Cyrillic characters" 364 | text: "Go to http://twitter.com/Русские_слова" 365 | expected: ["http://twitter.com/Русские_слова"] 366 | 367 | - description: "Extract non-ASCII host name URLs with protocol, but ignore host names bigger than 63 characters. Also handle exceptions for non-ASCII hostnames longer than 256 characters" 368 | text: "http://exampleこれは日本語です.com/path/index.html http://あああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああ.com/path/index.html" 369 | expected: ["http://exampleこれは日本語です.com/path/index.html"] 370 | 371 | - description: "Extract short URLs without protocol on ccTLD domains without path" 372 | text: "twitter.jp日本語it.so中国語foo.jp it.so foo.jp" 373 | expected: ["twitter.jp", "it.so", "foo.jp", "it.so", "foo.jp"] 374 | 375 | - description: "DO NOT extract invalid URL" 376 | text: "Hello http://xn--はじめよう.com/index.html" 377 | expected: [] 378 | 379 | - description: "DO NOT Extract URL with domain preceeded by underscore: http://domain-begin_dash_2314352345_dfasd.foo-cow_4352.com" 380 | text: "text http://domain-dash_2314352345_dfasd.foo-cow_4352.com" 381 | expected: [] 382 | 383 | - description: "DO NOT Extract URLs with a - or + in the middle of an email address" 384 | text: "Email me at name.al-lastname@foo.com or name.al+lastname@foo.com" 385 | expected: [] 386 | 387 | - description: "Extract URLs with a - in the middle" 388 | text: "Find my page at name.al-lastname.com" 389 | expected: ["name.al-lastname.com"] 390 | 391 | - description: "Extract some (tv|co) short URLs without protocol on ccTLD domains without path" 392 | text: "MLB.tv vine.co twitch.tv t.co" 393 | expected: ["MLB.tv", "vine.co", "twitch.tv", "t.co"] 394 | 395 | - description: "Extract URLs beginning with a non-breaking space (U+00A0)" 396 | text: "@user Try http:// example.com/path" 397 | expected: ["example.com/path"] 398 | 399 | - description: "Extract URLs with underscores and dashes in the subdomain" 400 | text: "test http://sub_domain-dash.twitter.com" 401 | expected: ["http://sub_domain-dash.twitter.com"] 402 | 403 | - description: "Extract URL with minimum number of valid characters" 404 | text: "test http://a.b.cd" 405 | expected: ["http://a.b.cd"] 406 | 407 | - description: "Extract URLs containing underscores and dashes" 408 | text: 
"test http://a_b.c-d.com" 409 | expected: ["http://a_b.c-d.com"] 410 | 411 | - description: "Extract URLs containing dashes in the subdomain" 412 | text: "test http://a-b.c.com" 413 | expected: ["http://a-b.c.com"] 414 | 415 | - description: "Extract URLs with dashes in the domain name" 416 | text: "test http://twitter-dash.com" 417 | expected: ["http://twitter-dash.com"] 418 | 419 | - description: "Extract URLs with lots of symbols then a period" 420 | text: "http://www.bestbuy.com/site/Currie+Technologies+-+Ezip+400+Scooter/9885188.p?id=1218189013070&skuId=9885188" 421 | expected: ["http://www.bestbuy.com/site/Currie+Technologies+-+Ezip+400+Scooter/9885188.p?id=1218189013070&skuId=9885188"] 422 | 423 | - description: "DO NOT extract URLs containing leading dashes in the subdomain" 424 | text: "test http://-leadingdash.twitter.com" 425 | expected: [] 426 | 427 | - description: "DO NOT extract URLs containing leading dashes in the domain with a subdomain" 428 | text: "test http://leadingdash.-twitter.com" 429 | expected: [] 430 | 431 | - description: "DO NOT extract URLs containing trailing dashes in the subdomain" 432 | text: "test http://trailingdash-.twitter.com" 433 | expected: [] 434 | 435 | - description: "DO NOT extract URLs containing trailing dashes in the domain with a subdomain" 436 | text: "test http://trailingdash.twitter-.com" 437 | expected: [] 438 | 439 | - description: "DO NOT extract URLs containing leading underscores in the subdomain" 440 | text: "test http://_leadingunderscore.twitter.com" 441 | expected: [] 442 | 443 | - description: "DO NOT extract URLs containing leading underscores in the domain with a subdomain" 444 | text: "test http://leadingunderscore._twitter.com" 445 | expected: [] 446 | 447 | - description: "DO NOT extract URLs containing trailing underscores in the subdomain" 448 | text: "test http://trailingunderscore_.twitter.com" 449 | expected: [] 450 | 451 | - description: "DO NOT extract URLs containing trailing underscores in the domain with a subdomain" 452 | text: "test http://trailingunderscore.twitter_.com" 453 | expected: [] 454 | 455 | - description: "DO NOT extract URLs containing leading dashes in the domain name" 456 | text: "test http://-twitter.com" 457 | expected: [] 458 | 459 | - description: "DO NOT extract URLs containing trailing dashes in the domain name" 460 | text: "test http://twitter-.com" 461 | expected: [] 462 | 463 | - description: "DO NOT extract URLs containing underscores in the domain name" 464 | text: "test http://twitter_underscore.com" 465 | expected: [] 466 | 467 | - description: "DO NOT extract URLs containing underscores in the tld" 468 | text: "test http://twitter.c_o_m" 469 | expected: [] 470 | 471 | - description: "Extract valid URL http://www.foo.com/foo/path-with-period./" 472 | text: "test http://www.foo.com/foo/path-with-period./" 473 | expected: ["http://www.foo.com/foo/path-with-period./"] 474 | 475 | - description: "Extract valid URL http://www.foo.org.za/foo/bar/688.1" 476 | text: "test http://www.foo.org.za/foo/bar/688.1" 477 | expected: ["http://www.foo.org.za/foo/bar/688.1"] 478 | 479 | - description: "Extract valid URL http://www.foo.com/bar-path/some.stm?param1=foo;param2=P1|0||P2|0" 480 | text: "test http://www.foo.com/bar-path/some.stm?param1=foo;param2=P1|0||P2|0" 481 | expected: ["http://www.foo.com/bar-path/some.stm?param1=foo;param2=P1|0||P2|0"] 482 | 483 | - description: "Extract valid URL http://foo.com/bar/123/foo_&_bar/" 484 | text: "test http://foo.com/bar/123/foo_&_bar/" 485 | expected: 
["http://foo.com/bar/123/foo_&_bar/"] 486 | 487 | - description: "Extract valid URL http://www.cp.sc.edu/events/65" 488 | text: "test http://www.cp.sc.edu/events/65 test" 489 | expected: ["http://www.cp.sc.edu/events/65"] 490 | 491 | - description: "Extract valid URL http://www.andersondaradio.no.comunidades.net/" 492 | text: "http://www.andersondaradio.no.comunidades.net/ test test" 493 | expected: ["http://www.andersondaradio.no.comunidades.net/"] 494 | 495 | - description: "Extract valid URL ELPAÍS.com" 496 | text: "test ELPAÍS.com" 497 | expected: ["ELPAÍS.com"] 498 | 499 | - description: "DO NOT include period at the end of URL" 500 | text: "test http://twitter.com/." 501 | expected: ["http://twitter.com/"] 502 | 503 | - description: "Extract a URL with '?' in fragment" 504 | text: "http://tn.com.ar/show/00056158/la-estrella-del-certamen-el-turno-de-pamela-anderson?fb_xd_fragment#?=&cb=fe17523f223b7&relation=parent.parent&transport=fragment&type=resize&height=20&ackdata" 505 | expected: ["http://tn.com.ar/show/00056158/la-estrella-del-certamen-el-turno-de-pamela-anderson?fb_xd_fragment#?=&cb=fe17523f223b7&relation=parent.parent&transport=fragment&type=resize&height=20&ackdata"] 506 | 507 | - description: "Extract a URL with '?' in fragment in a text" 508 | text: "text http://tn.com.ar/show/00056158/la-estrella-del-certamen-el-turno-de-pamela-anderson?fb_xd_fragment#?=&cb=fe17523f223b7&relation=parent.parent&transport=fragment&type=resize&height=20&ackdata text" 509 | expected: ["http://tn.com.ar/show/00056158/la-estrella-del-certamen-el-turno-de-pamela-anderson?fb_xd_fragment#?=&cb=fe17523f223b7&relation=parent.parent&transport=fragment&type=resize&height=20&ackdata"] 510 | 511 | # A common cause of runaway regex engines. 512 | - description: "Extract a URL with a ton of trailing periods" 513 | text: "Test a ton of periods http://example.com/path.........................................." 514 | expected: ["http://example.com/path"] 515 | 516 | - description: "Extract a URL with a ton of trailing commas" 517 | text: "Test a ton of periods http://example.com/,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,," 518 | expected: ["http://example.com/"] 519 | 520 | - description: "Extract a URL with a ton of trailing '!'" 521 | text: "Test a ton of periods http://example.com/path/!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" 
522 | expected: ["http://example.com/path/"] 523 | 524 | - description: "DO NOT extract URLs in hashtag or @mention" 525 | text: "#test.com @test.com #http://test.com @http://test.com #t.co/abcde @t.co/abcde" 526 | expected: [] 527 | 528 | - description: "Extract a t.co URL with a trailing apostrophe" 529 | text: "I really like http://t.co/pbY2NfTZ's website" 530 | expected: ["http://t.co/pbY2NfTZ"] 531 | 532 | - description: "Extract a t.co URL with a trailing hyphen" 533 | text: "Check this site out http://t.co/FNkPfmii- it's great" 534 | expected: ["http://t.co/FNkPfmii"] 535 | 536 | - description: "Extract a t.co URL with a trailing colon" 537 | text: "According to http://t.co/ulYGBYSo: the internet is cool" 538 | expected: ["http://t.co/ulYGBYSo"] 539 | 540 | - description: "Extract a t.co URL with a long path" 541 | text: "I really like http://t.co/abcdefghijklmnopqrstuvwxyz0123456789" 542 | expected: ["http://t.co/abcdefghijklmnopqrstuvwxyz0123456789"] 543 | 544 | - description: "DO NOT extract URLs with > 40 characters in a t.co slug" 545 | text: "I really like http://t.co/abcdefghijklmnopqrstuvwxyz012345678901234" 546 | expected: [] 547 | 548 | - description: "Extract domain followed by Japanese characters" 549 | text: "example.comてすとですtwitter.みんなです" 550 | expected: ["example.com", "twitter.みんな"] 551 | 552 | - description: "Extract URL before newline" 553 | text: "http://twitter.com\nhttp://example.com\nhttp://example.com/path\nexample.com/path\nit.so\nit.so/abcde" 554 | expected: ["http://twitter.com", "http://example.com", "http://example.com/path", "example.com/path", "it.so", "it.so/abcde"] 555 | 556 | - description: "DO NOT extract URL if preceded by $" 557 | text: "$http://twitter.com $twitter.com $http://t.co/abcde $t.co/abcde $t.co $TVI.CA $RBS.CA" 558 | expected: [] 559 | 560 | - description: "DO NOT extract .bz2 file name as URL" 561 | text: "long.test.tar.bz2 test.tar.bz2 tar.bz2" 562 | expected: [] 563 | 564 | - description: "DO NOT extract URL with gTLD followed by @ sign" 565 | text: "john.doe.gov@mail.com" 566 | expected: [] 567 | 568 | - description: "DO NOT extract URL with ccTLD followed by @ sign" 569 | text: "john.doe.jp@mail.com" 570 | expected: [] 571 | 572 | urls_with_indices: 573 | - description: "Extract a URL" 574 | text: "text http://google.com" 575 | expected: 576 | - url: "http://google.com" 577 | indices: [5, 22] 578 | 579 | - description: "Extract a URL from a Japanese tweet" 580 | text: "皆さん見てください! 
http://google.com" 581 | expected: 582 | - url: "http://google.com" 583 | indices: [11, 28] 584 | 585 | - description: "Extract URLs without protocol on ccTLD with slash" 586 | text: "t.co/abcde bit.ly/abcde" 587 | expected: 588 | - url: "t.co/abcde" 589 | indices: [0, 10] 590 | - url: "bit.ly/abcde" 591 | indices: [11, 23] 592 | 593 | - description: "Extract URLs without protocol surrounded by CJK characters" 594 | text: "twitter.comこれは日本語です。example.com中国語t.co/abcde한국twitter.com example2.comテストtwitter.com/abcde" 595 | expected: 596 | - url: "twitter.com" 597 | indices: [0, 11] 598 | - url: "example.com" 599 | indices: [20, 31] 600 | - url: "t.co/abcde" 601 | indices: [34, 44] 602 | - url: "twitter.com" 603 | indices: [46, 57] 604 | - url: "example2.com" 605 | indices: [58, 70] 606 | - url: "twitter.com/abcde" 607 | indices: [73, 90] 608 | 609 | - description: "Extract URLs with and without protocol surrounded by CJK characters" 610 | text: "http://twitter.com/これは日本語です。example.com中国語http://t.co/abcde한국twitter.comテストexample2.comテストhttp://twitter.com/abcde" 611 | expected: 612 | - url: "http://twitter.com/" 613 | indices: [0, 19] 614 | - url: "example.com" 615 | indices: [28, 39] 616 | - url: "http://t.co/abcde" 617 | indices: [42, 59] 618 | - url: "twitter.com" 619 | indices: [61, 72] 620 | - url: "example2.com" 621 | indices: [75, 87] 622 | - url: "http://twitter.com/abcde" 623 | indices: [90, 114] 624 | 625 | - description: "Extract t.co URLs skipping trailing characters and adjusting indices correctly" 626 | text: "http://t.co/pbY2NfTZ's http://t.co/2vYHpAc5; http://t.co/ulYGBYSo: http://t.co/8MkmHU0k+c http://t.co/TKLp64dY.x http://t.co/8t7G3ddS#a http://t.co/FNkPfmii-" 627 | expected: 628 | - url: "http://t.co/pbY2NfTZ" 629 | indices: [0, 20] 630 | - url: "http://t.co/2vYHpAc5" 631 | indices: [23, 43] 632 | - url: "http://t.co/ulYGBYSo" 633 | indices: [45, 65] 634 | - url: "http://t.co/8MkmHU0k" 635 | indices: [67, 87] 636 | - url: "http://t.co/TKLp64dY" 637 | indices: [90, 110] 638 | - url: "http://t.co/8t7G3ddS" 639 | indices: [113, 133] 640 | - url: "http://t.co/FNkPfmii" 641 | indices: [136, 156] 642 | 643 | - description: "Properly extract URL that contains t.co in referer" 644 | text: "http://www.foo.com?referer=https://t.co/abcde http://t.co/xyzzy" 645 | expected: 646 | - url: "http://www.foo.com?referer=https://t.co/abcde" 647 | indices: [0, 45] 648 | - url: "http://t.co/xyzzy" 649 | indices: [46, 63] 650 | 651 | - description: "Extract correct indices for duplicate instances of the same URL" 652 | text: "http://t.co http://t.co" 653 | expected: 654 | - url: "http://t.co" 655 | indices: [0, 11] 656 | - url: "http://t.co" 657 | indices: [12, 23] 658 | 659 | - description: "Extract I18N URL" 660 | text: "test http://xn--ls8h.XN--ls8h.la/" 661 | expected: 662 | - url: "http://xn--ls8h.XN--ls8h.la/" 663 | indices: [5, 33] 664 | 665 | - description: "Extract URLs with IDN(not encoded)" 666 | text: "test http://foobar.みんな/ http://foobar.中国/ http://foobar.پاکستان/ " 667 | expected: 668 | - url: "http://foobar.みんな/" 669 | indices: [5, 23] 670 | - url: "http://foobar.中国/" 671 | indices: [24, 41] 672 | - url: "http://foobar.پاکستان/" 673 | indices: [42, 64] 674 | 675 | urls_with_directional_markers: 676 | - description: "Extract URLs from RTL text" 677 | text: "\U00002066\U0000202Atest abcdef.com پاکستان http://twitter.com/\U0000202C\U00002069" 678 | expected: 679 | - url: "abcdef.com" 680 | indices: [7, 17] 681 | - url: "http://twitter.com/" 682 | indices: [26, 45] 683 | 684 | - 
description: "Extract URLs from RTL text with embedded directional marks" 685 | text: "This is a test \U00002066\U0000202Atwitter.com\U0000202C\U00002069 \U00002066\U0000202Ahttp://foobar.پاکستان/\U0000202C\U00002069⁩ قطر فلسطين عمان" 686 | expected: 687 | - url: "twitter.com" 688 | indices: [17, 28] 689 | - url: "http://foobar.پاکستان/" 690 | indices: [33, 55] 691 | 692 | tco_urls_with_params: 693 | - description: "Extract valid URL with params: https://t.co/UqIyJAJTfo?amp=1" 694 | text: "text https://t.co/UqIyJAJTfo?amp=1" 695 | expected: ["https://t.co/UqIyJAJTfo?amp=1"] 696 | 697 | - description: "Extract valid URL with params: https://t.co/UqIyJAJTfo?type=js" 698 | text: "text https://t.co/UqIyJAJTfo?type=js" 699 | expected: ["https://t.co/UqIyJAJTfo?type=js"] 700 | 701 | - description: "Extract valid URL with params: https://t.co/UqIyJAJTfo?ssr=true" 702 | text: "text https://t.co/UqIyJAJTfo?ssr=true" 703 | expected: ["https://t.co/UqIyJAJTfo?ssr=true"] 704 | 705 | - description: "Extract a valid URL with params: https://t.co/asdfdf?a=b#123" 706 | text: "text https://t.co/asdfdf?a=b#123" 707 | expected: ["https://t.co/asdfdf?a=b#123"] 708 | 709 | - description: "Extract a valid URL with params: https://t.co/sadfasdf?a=b&c=d" 710 | text: "text https://t.co/sadfasdf?a=b&c=d" 711 | expected: ["https://t.co/sadfasdf?a=b&c=d"] 712 | 713 | hashtags: 714 | - description: "Extract hashtag after emoji without variant selector (uFE0E or uFE0F)" 715 | text: "a ✌#hashtag here" 716 | expected: ["hashtag"] 717 | 718 | - description: "Extract hashtag after emoji with variant selector FE0E" 719 | text: "a ✌︎#hashtag here" 720 | expected: ["hashtag"] 721 | 722 | - description: "Extract hashtag after emoji with variant selector FE0F" 723 | text: "a ✌️#hashtag here" 724 | expected: ["hashtag"] 725 | 726 | - description: "Extract hashtag after emoji with skin tone without variant selector (FE0E or FE0F)" 727 | text: "a ✌🏿#hashtag here" 728 | expected: ["hashtag"] 729 | 730 | - description: "Extract hashtag after emoji with skin tone with variant selector FE0F" 731 | text: "a ✌🏿️#hashtag here" 732 | expected: ["hashtag"] 733 | 734 | - description: "Extract hashtag after emoji with zero-width-joiner" 735 | text: "a 👨‍👩‍👧#hashtag here" 736 | expected: ["hashtag"] 737 | 738 | - description: "Extract an all-alpha hashtag" 739 | text: "a #hashtag here" 740 | expected: ["hashtag"] 741 | 742 | - description: "Extract a letter-then-number hashtag" 743 | text: "this is #hashtag1" 744 | expected: ["hashtag1"] 745 | 746 | - description: "Extract a number-then-letter hashtag" 747 | text: "#1hashtag is this" 748 | expected: ["1hashtag"] 749 | 750 | - description: "DO NOT Extract an all-numeric hashtag" 751 | text: "On the #16 bus" 752 | expected: [] 753 | 754 | - description: "DO NOT Extract a single numeric hashtag" 755 | text: "#0" 756 | expected: [] 757 | 758 | - description: "Extract hashtag after bracket" 759 | text: "(#hashtag1 )#hashtag2 [#hashtag3 ]#hashtag4 ’#hashtag5’#hashtag6" 760 | expected: ["hashtag1", "hashtag2", "hashtag3", "hashtag4", "hashtag5", "hashtag6"] 761 | 762 | - description: "Extract a hashtag containing ñ" 763 | text: "I'll write more tests #mañana" 764 | expected: ["mañana"] 765 | 766 | - description: "Extract a hashtag containing é" 767 | text: "Working remotely #café" 768 | expected: ["café"] 769 | 770 | - description: "Extract a hashtag containing ü" 771 | text: "Getting my Oktoberfest on #münchen" 772 | expected: ["münchen"] 773 | 774 | - description: "DO NOT Extract a hashtag 
containing Japanese" 775 | text: "this is not valid: # 会議中 ハッシュ" 776 | expected: [] 777 | 778 | - description: "Extract a hashtag in Korean" 779 | text: "What is #트위터 anyway?" 780 | expected: ["트위터"] 781 | 782 | - description: "Extract a half-width Hangul hashtag" 783 | text: "Just random half-width Hangul #ᆪᆭᄚ" 784 | expected: ["ᆪᆭᄚ"] 785 | 786 | - description: "Extract a hashtag in Russian" 787 | text: "What is #ашок anyway?" 788 | expected: ["ашок"] 789 | 790 | - description: "Extract a starting katakana hashtag" 791 | text: "#カタカナ is a hashtag" 792 | expected: ["カタカナ"] 793 | 794 | - description: "Extract a starting hiragana hashtag" 795 | text: "#ひらがな FTW!" 796 | expected: ["ひらがな"] 797 | 798 | - description: "Extract a starting kanji hashtag" 799 | text: "#漢字 is the future" 800 | expected: ["漢字"] 801 | 802 | - description: "Extract a trailing katakana hashtag" 803 | text: "Hashtag #カタカナ" 804 | expected: ["カタカナ"] 805 | 806 | - description: "Extract a trailing hiragana hashtag" 807 | text: "Japanese hashtags #ひらがな" 808 | expected: ["ひらがな"] 809 | 810 | - description: "Extract a trailing kanji hashtag" 811 | text: "Study time #漢字" 812 | expected: ["漢字"] 813 | 814 | - description: "Extract a central katakana hashtag" 815 | text: "See my #カタカナ hashtag?" 816 | expected: ["カタカナ"] 817 | 818 | - description: "Extract a central hiragana hashtag" 819 | text: "Study #ひらがな for fun and profit" 820 | expected: ["ひらがな"] 821 | 822 | - description: "Extract a central kanji hashtag" 823 | text: "Some say #漢字 is the past. what do they know?" 824 | expected: ["漢字"] 825 | 826 | - description: "Extract a Kanji/Katakana mixed hashtag" 827 | text: "日本語ハッシュタグテスト #日本語ハッシュタグ" 828 | expected: ["日本語ハッシュタグ"] 829 | 830 | - description: "Extract a hashtag after a punctuation" 831 | text: "日本語ハッシュテスト。#日本語ハッシュタグ" 832 | expected: ["日本語ハッシュタグ"] 833 | 834 | - description: "DO NOT include a punctuation in a hashtag" 835 | text: "#日本語ハッシュタグ。" 836 | expected: ["日本語ハッシュタグ"] 837 | 838 | - description: "Extract a full-width Alnum hashtag" 839 | text: "全角英数字ハッシュタグ #hashtag123" 840 | expected: ["hashtag123"] 841 | 842 | - description: "DO NOT extract a hashtag without a preceding space" 843 | text: "日本語ハッシュタグ#日本語ハッシュタグ" 844 | expected: [] 845 | 846 | - description: "Hashtag with chouon" 847 | text: "長音ハッシュタグ。#サッカー" 848 | expected: ["サッカー"] 849 | 850 | - description: "Hashtag with half-width chouon" 851 | text: "長音ハッシュタグ。#サッカー" 852 | expected: ["サッカー"] 853 | 854 | - description: "Hashtag with half-widh voiced sounds marks" 855 | text: "#ハッシュタグ #パピプペポ" 856 | expected: ["ハッシュタグ", "パピプペポ"] 857 | 858 | - description: "Hashtag with half-width # after full-width !" 859 | text: "できましたよー!#日本語ハッシュタグ。" 860 | expected: ["日本語ハッシュタグ"] 861 | 862 | - description: "Hashtag with full-width # after full-width !" 
863 | text: "できましたよー!#日本語ハッシュタグ。" 864 | expected: ["日本語ハッシュタグ"] 865 | 866 | - description: "Hashtag with ideographic iteration mark" 867 | text: "#云々 #学問のすゝめ #いすゞ #各〻 #各〃" 868 | expected: ["云々", "学問のすゝめ", "いすゞ", "各〻", "各〃"] 869 | 870 | - description: "Extract hashtag with fullwidth tilde" 871 | text: "#メ~テレ ハッシュタグ内で~が認識されず" 872 | expected: ["メ~テレ"] 873 | 874 | - description: "Extract hashtag with wave dash" 875 | text: "#メ〜テレ ハッシュタグ内で~が認識されず" 876 | expected: ["メ〜テレ"] 877 | 878 | - description: "Hashtags with ş (U+015F)" 879 | text: "Here’s a test tweet for you: #Ateş #qrşt #ştu #ş" 880 | expected: ["Ateş", "qrşt", "ştu", "ş"] 881 | 882 | - description: "Hashtags with İ (U+0130) and ı (U+0131)" 883 | text: "Here’s a test tweet for you: #İn #ın" 884 | expected: ["İn", "ın"] 885 | 886 | - description: "Hashtag before punctuations" 887 | text: "#hashtag: #hashtag; #hashtag, #hashtag. #hashtag! #hashtag?" 888 | expected: ["hashtag", "hashtag", "hashtag", "hashtag", "hashtag", "hashtag"] 889 | 890 | - description: "Hashtag after punctuations" 891 | text: ":#hashtag ;#hashtag ,#hashtag .#hashtag !#hashtag ?#hashtag" 892 | expected: ["hashtag", "hashtag", "hashtag", "hashtag", "hashtag", "hashtag"] 893 | 894 | - description: "Hashtag before newline" 895 | text: "#hashtag\ntest\n#hashtag2\ntest\n#hashtag3\n" 896 | expected: ["hashtag", "hashtag2", "hashtag3"] 897 | 898 | - description: "DO NOT extract hashtag when # is followed by URL" 899 | text: "#http://twitter.com #https://twitter.com" 900 | expected: [] 901 | 902 | - description: "DO NOT extract hashtag if it's a part of URL" 903 | text: "http://twitter.com/#hashtag twitter.com/#hashtag" 904 | expected: [] 905 | 906 | - description: "Extract hashtags with Latin extended characters" 907 | text: "#Azərbaycanca #mûǁae #Čeština #Ċaoiṁín" 908 | expected: ["Azərbaycanca", "mûǁae", "Čeština", "Ċaoiṁín"] 909 | 910 | - description: "Extract Arabic hashtags" 911 | text: "#سیاست #ایران #السياسة #السياح #لغات #اتمی #کنفرانس #العربية #الجزيرة #فارسی" 912 | expected: ["سیاست", "ایران", "السياسة", "السياح", "لغات", "اتمی", "کنفرانس", "العربية", "الجزيرة", "فارسی"] 913 | 914 | - description: "Extract Arabic hashtags with underscore" 915 | text: "#برنامه_نویسی #رییس_جمهور #رئيس_الوزراء, #ثبت_نام. 
#لس_آنجلس" 916 | expected: ["برنامه_نویسی", "رییس_جمهور", "رئيس_الوزراء", "ثبت_نام", "لس_آنجلس"] 917 | 918 | - description: "Extract Hebrew hashtags" 919 | text: "#עַל־יְדֵי #וכו׳ #מ״כ" 920 | expected: ["עַל־יְדֵי", "וכו׳", "מ״כ"] 921 | 922 | - description: "Extract Thai hashtags" 923 | text: "#ผู้เริ่ม #การเมือง #รายละเอียด #นักท่องเที่ยว #ของขวัญ #สนามบิน #เดินทาง #ประธาน" 924 | expected: ["ผู้เริ่ม", "การเมือง", "รายละเอียด", "นักท่องเที่ยว", "ของขวัญ", "สนามบิน", "เดินทาง", "ประธาน"] 925 | 926 | - description: "Extract Arabic hashtags with Zero-Width Non-Joiner" 927 | text: "#أي‌بي‌إم #می‌خواهم" 928 | expected: ["أي‌بي‌إم", "می‌خواهم"] 929 | 930 | - description: "Extract Amharic hashtag" 931 | text: "የአላህ መልእክተኛ ሰለላሁ ዓለይሂ ወሰለም #ኢትዮሙስሊምስ" 932 | expected: ["ኢትዮሙስሊምስ"] 933 | 934 | - description: "Extract Sinhala hashtag with Zero-Width Joiner (U+200D)" 935 | text: "#ශ්‍රීලංකා" 936 | expected: ["ශ්‍රීලංකා"] 937 | 938 | - description: "Extract Arabic and Persian hashtags with numbers" 939 | text: "#۳۴۵هشتگ #هشتگ۶۷۸ #ســـلام_عليكم_٤٠٦" 940 | expected: ["۳۴۵هشتگ","هشتگ۶۷۸","ســـلام_عليكم_٤٠٦"] 941 | 942 | - description: "Extract Hindi hashtags" 943 | text: "#महात्मा #महात्मा_१२३४ #१२३४ गांधी" 944 | expected: ["महात्मा","महात्मा_१२३४"] 945 | 946 | - description: "Extract Indic script hashtags" 947 | text: "#বাংলা #ગુજરાતી #ಕನ್ನಡ #മലയാളം #ଓଡ଼ିଆ #ਪੰਜਾਬੀ #සිංහල #தமிழ் #తెలుగు" 948 | expected: ["বাংলা","ગુજરાતી","ಕನ್ನಡ","മലയാളം","ଓଡ଼ିଆ","ਪੰਜਾਬੀ","සිංහල","தமிழ்","తెలుగు"] 949 | 950 | - description: "Extract Tibetan hashtags" 951 | text: "#བོད་སྐད་ #བོད་སྐད" 952 | expected: ["བོད་སྐད་","བོད་སྐད"] 953 | 954 | - description: "Extract Khmer, Burmese, Laotian hashtags" 955 | text: "#មហាត្មះគន្ធី #မြင့်မြတ်သော #ຊີວະສາດ" 956 | expected: ["មហាត្មះគន្ធី","မြင့်မြတ်သော","ຊີວະສາດ"] 957 | 958 | - description: "Extract Greek hashtag" 959 | text: "#Μαχάτμα_Γκάντι ήταν Ινδός πολιτικός" 960 | expected: ["Μαχάτμα_Γκάντι"] 961 | 962 | - description: "Extract Armenian and Georgian hashtags" 963 | text: "#Մահաթմա #მაჰათმა" 964 | expected: ["Մահաթմա","მაჰათმა"] 965 | 966 | - description: "Extract hashtag with middle dot" 967 | text: "#il·lusió" 968 | expected: ["il·lusió"] 969 | 970 | - description: "DO NOT extract hashtags without a letter" 971 | text: "#_ #1_2 #122 #〃" 972 | expected: [] 973 | 974 | hashtags_from_astral: 975 | - description: "Extract hashtag with letter from astral plane (U+20021)" 976 | text: "#\U00020021" 977 | expected: ["\U00020021"] 978 | 979 | - description: "Extract hashtag with letter plus marker from astral plane (U+16f04 U+16f51)" 980 | text: "#\U00016f04\U00016f51" 981 | expected: ["\U00016f04\U00016f51"] 982 | 983 | - description: "Extract hashtag with letter plus number from astral plane (U+104a0)" 984 | text: "#\U00000041\U000104a0" 985 | expected: ["A\U000104a0"] 986 | 987 | hashtags_with_indices: 988 | - description: "Extract a hastag at the start" 989 | text: "#hashtag here" 990 | expected: 991 | - hashtag: "hashtag" 992 | indices: [0, 8] 993 | 994 | - description: "Extract a hastag at the end" 995 | text: "test a #hashtag" 996 | expected: 997 | - hashtag: "hashtag" 998 | indices: [7, 15] 999 | 1000 | - description: "Extract a hastag in the middle" 1001 | text: "test a #hashtag in a string" 1002 | expected: 1003 | - hashtag: "hashtag" 1004 | indices: [7, 15] 1005 | 1006 | - description: "Extract only a valid hashtag" 1007 | text: "#123 a #hashtag in a string" 1008 | expected: 1009 | - hashtag: "hashtag" 1010 | indices: [7, 15] 1011 | 1012 | - description: "Extract a hashtag in a 
string of multi-byte characters" 1013 | text: "会議中 #hashtag 会議中" 1014 | expected: 1015 | - hashtag: "hashtag" 1016 | indices: [4, 12] 1017 | 1018 | - description: "Extract multiple valid hashtags" 1019 | text: "One #two three #four" 1020 | expected: 1021 | - hashtag: "two" 1022 | indices: [4, 8] 1023 | - hashtag: "four" 1024 | indices: [15, 20] 1025 | 1026 | - description: "Extract a non-latin hashtag" 1027 | text: "Hashtags in #русский!" 1028 | expected: 1029 | - hashtag: "русский" 1030 | indices: [12, 20] 1031 | 1032 | - description: "Extract multiple non-latin hashtags" 1033 | text: "Hashtags in #中文, #日本語, #한국말, and #русский! Try it out!" 1034 | expected: 1035 | - hashtag: "中文" 1036 | indices: [12, 15] 1037 | - hashtag: "日本語" 1038 | indices: [17, 21] 1039 | - hashtag: "한국말" 1040 | indices: [23, 27] 1041 | - hashtag: "русский" 1042 | indices: [33, 41] 1043 | 1044 | cashtags: 1045 | - description: "Extract cashtags" 1046 | text: "Example cashtags: $TEST $Stock $symbol" 1047 | expected: ["TEST", "Stock", "symbol"] 1048 | 1049 | - description: "Extract cashtags with . or _" 1050 | text: "Example cashtags: $TEST.T $test.tt $Stock_X $symbol_ab" 1051 | expected: ["TEST.T", "test.tt", "Stock_X", "symbol_ab"] 1052 | 1053 | - description: "Do not extract cashtags if they contain numbers" 1054 | text: "$123 $test123 $TE123ST" 1055 | expected: [] 1056 | 1057 | - description: "Do not extract cashtags with non-ASCII characters" 1058 | text: "$ストック $株" 1059 | expected: [] 1060 | 1061 | - description: "Do not extract cashtags with punctuations" 1062 | text: "$ $. $- $@ $! $() $+" 1063 | expected: [] 1064 | 1065 | - description: "Do not include trailing . or _" 1066 | text: "$TEST. $TEST_" 1067 | expected: ["TEST", "TEST"] 1068 | 1069 | - description: "Do not extract cashtags if there is no space before $" 1070 | text: "$OK$NG$BAD text$NO .$NG $$NG" 1071 | expected: ["OK"] 1072 | 1073 | - description: "Do not extract too long cashtags" 1074 | text: "$CashtagMustBeLessThanSixCharacter" 1075 | expected: [] 1076 | 1077 | cashtags_with_indices: 1078 | - description: "Extract cashtags" 1079 | text: "Example: $TEST $symbol test" 1080 | expected: 1081 | - cashtag: "TEST" 1082 | indices: [9, 14] 1083 | - cashtag: "symbol" 1084 | indices: [15, 22] 1085 | 1086 | - description: "Extract cashtags with . or _" 1087 | text: "Example: $TEST.T test $symbol_ab end" 1088 | expected: 1089 | - cashtag: "TEST.T" 1090 | indices: [9, 16] 1091 | - cashtag: "symbol_ab" 1092 | indices: [22, 32] -------------------------------------------------------------------------------- /tests/cases/validate.yml: -------------------------------------------------------------------------------- 1 | tests: 2 | tweets: 3 | - description: "Valid Tweet: < 20 characters" 4 | text: "I am a Tweet" 5 | expected: true 6 | 7 | - description: "Valid Tweet: 140 characters" 8 | text: "A lie gets halfway around the world before the truth has a chance to get its pants on. Winston Churchill (1874-1965) http://bit.ly/dJpywL" 9 | expected: true 10 | 11 | - description: "Valid Tweet: 140 characters (with accents)" 12 | text: "A lié géts halfway arøünd thé wørld béføré thé truth has a chance tø get its pants øn. 
Winston Churchill (1874-1965) http://bit.ly/dJpywL" 13 | expected: true 14 | 15 | - description: "Valid Tweet: 140 characters (double byte characters)" 16 | text: "のののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののの" 17 | expected: true 18 | 19 | - description: "Valid Tweet: 140 characters (double word characters)" 20 | text: "\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431" 21 | expected: true 22 | 23 | - description: "Invalid Tweet: no characters (empty)" 24 | text: "" 25 | expected: false 26 | 27 | - description: "Invalid Tweet: 141 characters" 28 | text: "A lie gets halfway around the world before the truth has a chance to get its pants on. -- Winston Churchill (1874-1965) http://bit.ly/dJpywL" 29 | expected: false 30 | 31 | - description: "Invalid Tweet: 141 characters (due to newline)" 32 | text: "A lie gets halfway around the world before the truth has a chance to get its pants on. 
\n- Winston Churchill (1874-1965) http://bit.ly/dJpywL" 33 | expected: false 34 | 35 | usernames: 36 | - description: "Valid username: a-z < 20 characters" 37 | text: "@username" 38 | expected: true 39 | 40 | - description: "All numeric username are allowed" 41 | text: "@12345" 42 | expected: true 43 | 44 | - description: "Usernames should allow the _ character" 45 | text: "@example_name" 46 | expected: true 47 | 48 | - description: "Usernames SHOULD NOT allow the - character" 49 | text: "@example-name" 50 | expected: false 51 | 52 | lists: 53 | - description: "Valid list: a-z < 20 characters" 54 | text: "@username/list" 55 | expected: true 56 | 57 | - description: "A username alone SHOULD NOT be considered a valid list" 58 | text: "@username" 59 | expected: false 60 | 61 | - description: "A username followed by a slash SHOULD NOT be considered a valid list" 62 | text: "@username/" 63 | expected: false 64 | 65 | - description: "Validation SHOULD NOT allow leading spaces" 66 | text: " @username/list" 67 | expected: false 68 | 69 | - description: "Validation SHOULD NOT allow trailing spaces" 70 | text: "@username/list " 71 | expected: false 72 | 73 | hashtags: 74 | - description: "Valid hashtag: a-z < 20 characters" 75 | text: "#hashtag" 76 | expected: true 77 | 78 | - description: "Valid hashtag: number followed by letters" 79 | text: "#1st" 80 | expected: true 81 | 82 | - description: "Valid hashtag: letters and numbers mixed" 83 | text: "#that1time" 84 | expected: true 85 | 86 | - description: "Valid hashtag: letter followed by numbers" 87 | text: "#easyas123" 88 | expected: true 89 | 90 | - description: "Invalid hashtag: all numbers" 91 | text: "#12345" 92 | expected: false 93 | 94 | - description: "Valid hashtag: Russian text" 95 | text: "#ашок" 96 | expected: true 97 | 98 | - description: "Valid hashtag: Korean text" 99 | text: "#트위터" 100 | expected: true 101 | 102 | urls: 103 | - description: "Valid url: protocol + domain" 104 | text: "http://example.com" 105 | expected: true 106 | 107 | - description: "Valid url: ssl + domain + path + query" 108 | text: "https://example.com/path/to/resource?search=foo&lang=en" 109 | expected: true 110 | 111 | - description: "Valid url: protocol + domain + path + fragment" 112 | text: "http://twitter.com/#!/twitter" 113 | expected: true 114 | 115 | - description: "Valid url: cased protocol and domain" 116 | text: "HTTPS://www.ExaMPLE.COM/index.html" 117 | expected: true 118 | 119 | - description: "Valid url: port and userinfo" 120 | text: "http://user:PASSW0RD@example.com:8080/login.php" 121 | expected: true 122 | 123 | - description: "Valid url: matrix path parameters" 124 | text: "http://sports.yahoo.com/nfl/news;_ylt=Aom0;ylu=XyZ?slug=ap-superbowlnotebook" 125 | expected: true 126 | 127 | - description: "Valid url: ipv4" 128 | text: "http://192.168.0.1/index.html?src=asdf" 129 | expected: true 130 | 131 | - description: "Valid url: ipv6" 132 | text: "http://[3ffe:1900:4545:3:200:f8ff:fe21:67cf]:80/index.html" 133 | expected: true 134 | 135 | - description: "Valid url: underscore in subdomain" 136 | text: "http://test_underscore.twitter.com" 137 | expected: true 138 | 139 | - description: "Valid url: sub delims and question marks" 140 | text: "http://example.com?foo=$bar.;baz?BAZ&c=d-#top/?stories+" 141 | expected: true 142 | 143 | - description: "Valid unicode url: unicode domain" 144 | text: "http://☃.net/" 145 | expected: true 146 | 147 | - description: "Valid url: Cyrillic characters in path" 148 | text: "http://example.com/Русские_слова" 149 | 
expected: true 150 | 151 | - description: "Valid url: trailing hyphen" 152 | text: "https://www.youtube.com/playlist?list=PL0ZPu8XSRTB7wZzn0mLHMvyzVFeRxbWn-" 153 | expected: true 154 | 155 | - description: "Invalid url: invalid scheme" 156 | text: "ftp://www.example.com/" 157 | expected: false 158 | 159 | - description: "Invalid url: invalid userinfo characters" 160 | text: "https://user:pass[word]@www.example.com/" 161 | expected: false 162 | 163 | - description: "Invalid url: underscore in domain" 164 | text: "http://domain-dash_2314352345_dfasd.foo-cow_4352.com" 165 | expected: false 166 | 167 | - description: "Invalid url: domain beginning dash" 168 | text: "http://www.-domain4352.com/" 169 | expected: false 170 | 171 | - description: "Invalid url: domain trailing dash" 172 | text: "http://www.domain4352-.com/" 173 | expected: false 174 | 175 | - description: "Invalid url: unicode domain trailing dash" 176 | text: "http://☃-.net/" 177 | expected: false 178 | 179 | - description: "Invalid url: improperly encoded unicode domain" 180 | text: "http://%e2%98%83.net/" 181 | expected: false 182 | 183 | - description: "Invalid url: invalid IP" 184 | text: "http://256.1.2.3/" 185 | expected: false 186 | 187 | - description: "Invalid url: invalid char in path" 188 | text: "http://en.wikipedia.org/wiki/\"#Punctuation" 189 | expected: false 190 | 191 | - description: "Invalid url: trailing space" 192 | text: "http://example.com/#anchor " 193 | expected: false 194 | 195 | - description: "Invalid url: domain has leading hyphen" 196 | text: "http://test.-twitter.com" 197 | expected: false 198 | 199 | urls_without_protocol: 200 | - description: "Valid url without protocol: domain + gTLD" 201 | text: "example.com" 202 | expected: true 203 | 204 | - description: "Valid url without protocol: subdomain + domain + gTLD" 205 | text: "www.example.com" 206 | expected: true 207 | 208 | - description: "Valid url without protocol: domain + ccTLD" 209 | text: "t.co" 210 | expected: true 211 | 212 | - description: "Valid url without protocol: subdomain + domain + ccTLD" 213 | text: "foo.co.jp" 214 | expected: true 215 | 216 | - description: "Valid url without protocol: domain + path + query" 217 | text: "example.com/path/to/resource?search=foo&lang=en" 218 | expected: true 219 | 220 | WeightedTweetsCounterTest: 221 | - description: "Regular Tweet with url" 222 | text: "Hi http://test.co" 223 | expected: 224 | weightedLength: 26 225 | valid: true 226 | permillage: 92 227 | displayRangeStart: 0 228 | displayRangeEnd: 16 229 | validRangeStart: 0 230 | validRangeEnd: 16 231 | 232 | - description: "Just url" 233 | text: "http://test.co" 234 | expected: 235 | weightedLength: 23 236 | valid: true 237 | permillage: 82 238 | displayRangeStart: 0 239 | displayRangeEnd: 13 240 | validRangeStart: 0 241 | validRangeEnd: 13 242 | 243 | - description: "Long tweet, overflow at char index 280" 244 | text: "285 chars-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-" 245 | expected: 246 | weightedLength: 285 247 | valid: false 248 | permillage: 1017 249 | displayRangeStart: 0 250 | displayRangeEnd: 284 251 | validRangeStart: 0 252 | validRangeEnd: 279 253 | 254 | - description: "Long tweet with url in the middle, overflow at char index 284" 255 | text: "285 chars- http://www.twitter.com/jack 
xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-" 256 | expected: 257 | weightedLength: 299 258 | valid: false 259 | permillage: 1067 260 | displayRangeStart: 0 261 | displayRangeEnd: 302 262 | validRangeStart: 0 263 | validRangeEnd: 283 264 | 265 | - description: "Long tweet with url at the end, overflow at char index 265" 266 | text: "xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx- http://www.twitter.com/jack " 267 | expected: 268 | weightedLength: 289 269 | valid: false 270 | permillage: 1032 271 | displayRangeStart: 0 272 | displayRangeEnd: 292 273 | validRangeStart: 0 274 | validRangeEnd: 264 275 | 276 | - description: "10 url string, no overflow" 277 | text: "https://www.twitter.com/aloha https://www.twitter.com/aloha https://www.twitter.com/aloha https://www.twitter.com/aloha https://www.twitter.com/aloha https://www.twitter.com/aloha https://www.twitter.com/aloha https://www.twitter.com/aloha https://www.twitter.com/aloha https://www.twitter.com/aloha " 278 | expected: 279 | weightedLength: 240 280 | valid: true 281 | permillage: 857 282 | displayRangeStart: 0 283 | displayRangeEnd: 299 284 | validRangeStart: 0 285 | validRangeEnd: 299 286 | 287 | - description: "160 CJK char, overflow at char index 140" 288 | text: "故人西辞黄鹤楼,烟花三月下扬州。孤帆远影碧空尽,唯见长江天际流。朱雀桥边野草花,乌衣巷口夕阳斜。旧时王谢堂前燕,飞入寻常百姓家。朝辞白帝彩云间,千里江陵一日还。两岸猿声啼不住,轻舟已过万重山。泪湿罗巾梦不成,夜深前殿按歌声。红颜未老恩先断,斜倚薰笼坐到明。独在异乡为异客,每逢佳节倍思亲。遥知兄弟登高处,遍插茱萸少一人。" 289 | expected: 290 | weightedLength: 320 291 | valid: false 292 | permillage: 1142 293 | displayRangeStart: 0 294 | displayRangeEnd: 159 295 | validRangeStart: 0 296 | validRangeEnd: 139 297 | 298 | - description: "160 emoji char, overflow at char index 140" 299 | text: "😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷" 300 | expected: 301 | weightedLength: 320 302 | valid: false 303 | permillage: 1142 304 | displayRangeStart: 0 305 | displayRangeEnd: 319 306 | validRangeStart: 0 307 | validRangeEnd: 279 308 | 309 | - description: "3 latin char + 160 CJK char, overflow at char index 141" 310 | text: "the故人西辞黄鹤楼,烟花三月下扬州。孤帆远影碧空尽,唯见长江天际流。朱雀桥边野草花,乌衣巷口夕阳斜。旧时王谢堂前燕,飞入寻常百姓家。朝辞白帝彩云间,千里江陵一日还。两岸猿声啼不住,轻舟已过万重山。泪湿罗巾梦不成,夜深前殿按歌声。红颜未老恩先断,斜倚薰笼坐到明。独在异乡为异客,每逢佳节倍思亲。遥知兄弟登高处,遍插茱萸少一人。" 311 | expected: 312 | weightedLength: 323 313 | valid: false 314 | permillage: 1153 315 | displayRangeStart: 0 316 | displayRangeEnd: 162 317 | validRangeStart: 0 318 | validRangeEnd: 140 319 | 320 | - description: "'Á' is normalized into 1 char" 321 | text: "ÁB" 322 | expected: 323 | weightedLength: 2 324 | valid: true 325 | permillage: 7 326 | displayRangeStart: 0 327 | displayRangeEnd: 2 328 | validRangeStart: 0 329 | validRangeEnd: 2 330 | 331 | - description: "שּׁ is normalized into 3 chars" 332 | text: "Aשּׁ" 333 | expected: 334 | weightedLength: 4 335 | valid: true 336 | permillage: 14 337 | displayRangeStart: 0 338 | displayRangeEnd: 1 339 | validRangeStart: 0 340 | validRangeEnd: 1 341 | 342 | - description: "282 chars with a normalized character within valid range but outside 280" 343 | text: "282 
chars-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxÁx" 344 | expected: 345 | weightedLength: 281 346 | valid: false 347 | permillage: 1003 348 | displayRangeStart: 0 349 | displayRangeEnd: 281 350 | validRangeStart: 0 351 | validRangeEnd: 280 352 | 353 | - description: "Count a mix of single byte single word, and double word unicode characters" 354 | text: "H🐱☺👨‍👩‍👧‍👦" 355 | expected: 356 | weightedLength: 16 357 | valid: true 358 | permillage: 57 359 | displayRangeStart: 0 360 | displayRangeEnd: 14 361 | validRangeStart: 0 362 | validRangeEnd: 14 363 | 364 | - description: "Count unicode emoji chars inside the basic multilingual plane" 365 | text: "😷👾😡🔥💩" 366 | expected: 367 | weightedLength: 10 368 | valid: true 369 | permillage: 35 370 | displayRangeStart: 0 371 | displayRangeEnd: 9 372 | validRangeStart: 0 373 | validRangeEnd: 9 374 | 375 | - description: "Count unicode emoji chars outside the basic multilingual plane with skin tone modifiers" 376 | text: "🙋🏽👨‍🎤" 377 | expected: 378 | weightedLength: 9 379 | valid: true 380 | permillage: 32 381 | displayRangeStart: 0 382 | displayRangeEnd: 8 383 | validRangeStart: 0 384 | validRangeEnd: 8 385 | 386 | - description: "Handle General Punctuation Characters with visible spaces(u2000-200A)" 387 | text: "This is a tweet with general punctuation characters: \u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u200B\u200C\u200D ‐ ‑ ‒ – — ― ‖ ‗ ‘ ’ ‚ ‛ “ ” „ ‟ ′ ″ ‴ ‵ ‶ ‷" 388 | expected: 389 | weightedLength: 112 390 | valid: true 391 | permillage: 400 392 | displayRangeStart: 0 393 | displayRangeEnd: 111 394 | validRangeStart: 0 395 | validRangeEnd: 111 396 | 397 | - description: "Handle long url with invalid domain labels and short url" 398 | text: "Long url with invalid domain labels and a short url: 
https://somesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlo
ngurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurl.com/foo https://somesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlong
urlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurl.com/foo https://somesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongur
lsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurl.com/foo https://validurl.com" 399 | expected: 400 | weightedLength: 12079 401 | valid: false 402 | permillage: 43139 403 | displayRangeStart: 0 404 | displayRangeEnd: 12075 405 | validRangeStart: 0 406 | validRangeEnd: 279 407 | 408 | - description: "Handle a 64 character domain without protocol" 409 | text: "randomurlrandomurlrandomurlrandomurlrandomurlrandomurlrandomurls.com" 410 | expected: 411 | weightedLength: 68 412 | valid: true 413 | permillage: 242 414 | displayRangeStart: 0 415 | displayRangeEnd: 67 416 | validRangeStart: 0 417 | validRangeEnd: 67 418 | 419 | - description: "Do not allow > 140 CJK characters by virtue of CJK chars greater than 63 punycode encoded chars in the host" 420 | text: "あいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこ http://あいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいう.com" 421 | expected: 422 | weightedLength: 358 423 | valid: false 424 | permillage: 1278 425 | displayRangeStart: 0 426 | displayRangeEnd: 184 427 | validRangeStart: 0 428 | validRangeEnd: 143 429 | 430 | - description: "Allow > 140 CJK characters by virtue of CJK chars less than 63 punycode encoded chars in the host" 431 | text: "あいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこ http://あいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあい.com" 432 | expected: 433 | weightedLength: 264 434 | valid: true 435 | permillage: 942 436 | displayRangeStart: 0 437 | displayRangeEnd: 183 438 | validRangeStart: 0 439 | validRangeEnd: 183 440 | 441 | WeightedTweetsWithDiscountedEmojiCounterTest: 442 | - description: "Regular Tweet with url" 443 | text: "Hi http://test.co" 444 | expected: 445 | weightedLength: 26 446 | valid: true 447 | permillage: 92 448 | displayRangeStart: 0 449 | displayRangeEnd: 16 450 | validRangeStart: 0 451 | validRangeEnd: 16 452 | 453 | - description: "Just url" 454 | text: "http://test.co" 455 | expected: 456 | weightedLength: 23 457 | valid: true 458 | permillage: 82 459 | displayRangeStart: 0 460 | displayRangeEnd: 13 461 | validRangeStart: 0 462 | validRangeEnd: 13 463 | 464 | - description: "Long tweet, overflow at char 
index 280" 465 | text: "285 chars-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-" 466 | expected: 467 | weightedLength: 285 468 | valid: false 469 | permillage: 1017 470 | displayRangeStart: 0 471 | displayRangeEnd: 284 472 | validRangeStart: 0 473 | validRangeEnd: 279 474 | 475 | - description: "Long tweet with url in the middle, overflow at char index 284" 476 | text: "285 chars- http://www.twitter.com/jack xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-" 477 | expected: 478 | weightedLength: 299 479 | valid: false 480 | permillage: 1067 481 | displayRangeStart: 0 482 | displayRangeEnd: 302 483 | validRangeStart: 0 484 | validRangeEnd: 283 485 | 486 | - description: "Long tweet with url at the end, overflow at char index 265" 487 | text: "xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx- http://www.twitter.com/jack " 488 | expected: 489 | weightedLength: 289 490 | valid: false 491 | permillage: 1032 492 | displayRangeStart: 0 493 | displayRangeEnd: 292 494 | validRangeStart: 0 495 | validRangeEnd: 264 496 | 497 | - description: "10 url string, no overflow" 498 | text: "https://www.twitter.com/aloha https://www.twitter.com/aloha https://www.twitter.com/aloha https://www.twitter.com/aloha https://www.twitter.com/aloha https://www.twitter.com/aloha https://www.twitter.com/aloha https://www.twitter.com/aloha https://www.twitter.com/aloha https://www.twitter.com/aloha " 499 | expected: 500 | weightedLength: 240 501 | valid: true 502 | permillage: 857 503 | displayRangeStart: 0 504 | displayRangeEnd: 299 505 | validRangeStart: 0 506 | validRangeEnd: 299 507 | 508 | - description: "160 CJK char, overflow at char index 140" 509 | text: "故人西辞黄鹤楼,烟花三月下扬州。孤帆远影碧空尽,唯见长江天际流。朱雀桥边野草花,乌衣巷口夕阳斜。旧时王谢堂前燕,飞入寻常百姓家。朝辞白帝彩云间,千里江陵一日还。两岸猿声啼不住,轻舟已过万重山。泪湿罗巾梦不成,夜深前殿按歌声。红颜未老恩先断,斜倚薰笼坐到明。独在异乡为异客,每逢佳节倍思亲。遥知兄弟登高处,遍插茱萸少一人。" 510 | expected: 511 | weightedLength: 320 512 | valid: false 513 | permillage: 1142 514 | displayRangeStart: 0 515 | displayRangeEnd: 159 516 | validRangeStart: 0 517 | validRangeEnd: 139 518 | 519 | - description: "160 emoji char, overflow at char index 140" 520 | text: "😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷" 521 | expected: 522 | weightedLength: 320 523 | valid: false 524 | permillage: 1142 525 | displayRangeStart: 0 526 | displayRangeEnd: 319 527 | validRangeStart: 0 528 | validRangeEnd: 279 529 | 530 | - description: "3 latin char + 160 CJK char, overflow at char index 141" 531 | text: "the故人西辞黄鹤楼,烟花三月下扬州。孤帆远影碧空尽,唯见长江天际流。朱雀桥边野草花,乌衣巷口夕阳斜。旧时王谢堂前燕,飞入寻常百姓家。朝辞白帝彩云间,千里江陵一日还。两岸猿声啼不住,轻舟已过万重山。泪湿罗巾梦不成,夜深前殿按歌声。红颜未老恩先断,斜倚薰笼坐到明。独在异乡为异客,每逢佳节倍思亲。遥知兄弟登高处,遍插茱萸少一人。" 532 | expected: 533 | weightedLength: 323 534 | valid: false 535 | permillage: 1153 536 | displayRangeStart: 0 537 | displayRangeEnd: 162 538 | validRangeStart: 0 539 | validRangeEnd: 
140 540 | 541 | - description: "282 chars with a normalized character within valid range but outside 280" 542 | text: "282 chars-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxÁx" 543 | expected: 544 | weightedLength: 281 545 | valid: false 546 | permillage: 1003 547 | displayRangeStart: 0 548 | displayRangeEnd: 281 549 | validRangeStart: 0 550 | validRangeEnd: 280 551 | 552 | - description: "Count a mix of single byte single word, and double word unicode characters" 553 | text: "H🐱☺👨‍👩‍👧‍👦" 554 | expected: 555 | weightedLength: 7 556 | valid: true 557 | permillage: 25 558 | displayRangeStart: 0 559 | displayRangeEnd: 14 560 | validRangeStart: 0 561 | validRangeEnd: 14 562 | 563 | - description: "Count unicode emoji chars inside the basic multilingual plane" 564 | text: "😷👾😡🔥💩" 565 | expected: 566 | weightedLength: 10 567 | valid: true 568 | permillage: 35 569 | displayRangeStart: 0 570 | displayRangeEnd: 9 571 | validRangeStart: 0 572 | validRangeEnd: 9 573 | 574 | - description: "Count unicode emoji chars outside the basic multilingual plane with skin tone modifiers" 575 | text: "🙋🏽👨‍🎤" 576 | expected: 577 | weightedLength: 4 578 | valid: true 579 | permillage: 14 580 | displayRangeStart: 0 581 | displayRangeEnd: 8 582 | validRangeStart: 0 583 | validRangeEnd: 8 584 | 585 | - description: "Handle General Punctuation Characters with visible spaces(u2000-200A), no ZWJ/ZWNJ" 586 | text: "This is a tweet with general punctuation characters: \u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u200B ‐ ‑ ‒ – — ― ‖ ‗ ‘ ’ ‚ ‛ “ ” „ ‟ ′ ″ ‴ ‵ ‶ ‷" 587 | expected: 588 | weightedLength: 110 589 | valid: true 590 | permillage: 392 591 | displayRangeStart: 0 592 | displayRangeEnd: 109 593 | validRangeStart: 0 594 | validRangeEnd: 109 595 | 596 | - description: "Handle long url with invalid domain labels and short url" 597 | text: "Long url with invalid domain labels and a short url: 
https://somesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlo
ngurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurl.com/foo https://somesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlong
urlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurl.com/foo https://somesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongur
lsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurl.com/foo https://validurl.com" 598 | expected: 599 | weightedLength: 12079 600 | valid: false 601 | permillage: 43139 602 | displayRangeStart: 0 603 | displayRangeEnd: 12075 604 | validRangeStart: 0 605 | validRangeEnd: 279 606 | 607 | - description: "Handle a 64 character domain without protocol" 608 | text: "randomurlrandomurlrandomurlrandomurlrandomurlrandomurlrandomurls.com" 609 | expected: 610 | weightedLength: 68 611 | valid: true 612 | permillage: 242 613 | displayRangeStart: 0 614 | displayRangeEnd: 67 615 | validRangeStart: 0 616 | validRangeEnd: 67 617 | 618 | - description: "Do not allow > 140 CJK characters by virtue of CJK chars greater than 63 punycode encoded chars in the host" 619 | text: "あいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこ http://あいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいう.com" 620 | expected: 621 | weightedLength: 358 622 | valid: false 623 | permillage: 1278 624 | displayRangeStart: 0 625 | displayRangeEnd: 184 626 | validRangeStart: 0 627 | validRangeEnd: 143 628 | 629 | - description: "Allow > 140 CJK characters by virtue of CJK chars less than 63 punycode encoded chars in the host" 630 | text: "あいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこ http://あいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあい.com" 631 | expected: 632 | weightedLength: 264 633 | valid: true 634 | permillage: 942 635 | displayRangeStart: 0 636 | displayRangeEnd: 183 637 | validRangeStart: 0 638 | validRangeEnd: 183 639 | 640 | - description: "140 family emoji" 641 | text: 
"👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦" 642 | expected: 643 | weightedLength: 280 644 | valid: true 645 | permillage: 1000 646 | displayRangeStart: 0 647 | displayRangeEnd: 1539 648 | validRangeStart: 0 649 | validRangeEnd: 1539 650 | 651 | - description: "Emoji with a leading character in the latin range is counted as 2" 652 | text: "1⃣" 653 | expected: 654 | weightedLength: 2 655 | valid: true 656 | permillage: 7 657 | displayRangeStart: 0 658 | displayRangeEnd: 1 659 | validRangeStart: 0 660 | validRangeEnd: 1 661 | 662 | - description: "Unicode 10.0 emoji" 663 | text: "Unicode 10.0 emoji: 🤪; 🧕; 🧕🏾; 🏴󠁧󠁢󠁥󠁮󠁧󠁿" 664 | expected: 665 | weightedLength: 34 666 | valid: true 667 | permillage: 121 668 | displayRangeStart: 0 669 | displayRangeEnd: 47 670 | validRangeStart: 0 671 | validRangeEnd: 47 672 | 673 | - description: "Unicode 9.0 emoji" 674 | text: "Unicode 9.0 emoji: 🤠; 💃; 💃🏾" 675 | expected: 676 | weightedLength: 29 677 | valid: true 678 | permillage: 103 679 | displayRangeStart: 0 680 | displayRangeEnd: 30 681 | validRangeStart: 0 682 | validRangeEnd: 30 683 | 684 | UnicodeDirectionalMarkerCounterTest: 685 | - description: "Handle invalid characters" 686 | text: "ABC\u202A\uFFFFABC\uFFFE" 687 | expected: 688 | weightedLength: 12 689 | valid: false 690 | permillage: 42 691 | displayRangeStart: 0 692 | displayRangeEnd: 8 693 | validRangeStart: 0 694 | validRangeEnd: 3 695 | 696 | - description: "Tweet text containing directional characters should be considered valid" 697 | text: "\U00002066\U0000202Ahttp://foobar.پاکستان/\U0000202C\U00002069" 698 | expected: 699 | weightedLength: 31 700 | valid: true 701 | permillage: 110 702 | displayRangeStart: 0 703 | displayRangeEnd: 25 704 | validRangeStart: 0 705 | validRangeEnd: 25 706 | --------------------------------------------------------------------------------