├── tests ├── __init__.py ├── test_added.py ├── test_conformance.py └── cases │ ├── added.yml │ ├── extract.yml │ └── validate.yml ├── twitter_text ├── regexp │ ├── __init__.py │ ├── invalid_chars_group.py │ ├── punct.py │ ├── valid_port_number.py │ ├── valid_punycode.py │ ├── cyrillic_letters_and_marks.py │ ├── spaces_group.py │ ├── directional_markers_group.py │ ├── valid_url_query_ending_chars.py │ ├── valid_url_query_chars.py │ ├── invalid_url_without_protocol_preceding_chars.py │ ├── latin_accent_chars.py │ ├── invalid_chars.py │ ├── valid_domain_chars.py │ ├── valid_domain_name.py │ ├── valid_subdomain.py │ ├── valid_url_preceding_chars.py │ ├── valid_tco_url.py │ ├── valid_general_url_path_chars.py │ ├── invalid_domain_chars.py │ ├── valid_ascii_domain.py │ ├── valid_domain.py │ ├── valid_url_balanced_parens.py │ ├── valid_url_path_ending_chars.py │ ├── valid_url_path.py │ ├── extract_url.py │ ├── valid_cctld.py │ ├── emoji.py │ └── valid_gtld.py ├── has_invalid_characters.py ├── __init__.py ├── regex_supplant.py ├── extract_emojis.py ├── get_character_weight.py ├── config.py ├── extract_urls.py └── parse_tweet.py ├── pytest.ini ├── .gitignore ├── tox.ini ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report_ja.md │ └── bug_report_en.md └── workflows │ ├── release.yml │ └── test.yml ├── .readthedocs.yml ├── doc ├── index.rst └── conf.py ├── CHANGELOG.md ├── Makefile ├── LICENSE ├── .devcontainer └── devcontainer.json ├── pyproject.toml ├── README.rst └── poetry.lock /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /twitter_text/regexp/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = tests 3 | -------------------------------------------------------------------------------- /twitter_text/regexp/invalid_chars_group.py: -------------------------------------------------------------------------------- 1 | invalid_chars_group = r'\uFFFE\uFEFF\uFFFF' 2 | -------------------------------------------------------------------------------- /twitter_text/regexp/punct.py: -------------------------------------------------------------------------------- 1 | punct = r"\!'#%&'\(\)*\+,\\\-\.\/:;<=>\?@\[\]\^_{|}~\$/" 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.idea/ 2 | /venv/ 3 | __pycache__/ 4 | /dist/ 5 | /build/ 6 | *.egg-info 7 | /.tox/ 8 | -------------------------------------------------------------------------------- /twitter_text/regexp/valid_port_number.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | valid_port_number = re.compile(r'[0-9]+') 4 | -------------------------------------------------------------------------------- /twitter_text/regexp/valid_punycode.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | valid_punycode = re.compile(r'(?:xn--[\-0-9a-z]+)') 4 | -------------------------------------------------------------------------------- /twitter_text/regexp/cyrillic_letters_and_marks.py: 
-------------------------------------------------------------------------------- 1 | import re 2 | 3 | cyrillic_letters_and_marks = re.compile(r'\u0400-\u04FF') 4 | -------------------------------------------------------------------------------- /twitter_text/regexp/spaces_group.py: -------------------------------------------------------------------------------- 1 | spaces_group = r'\x09-\x0D\x20\x85\xA0\u1680\u180E\u2000-\u200A\u2028\u2029\u202F\u205F\u3000' 2 | -------------------------------------------------------------------------------- /twitter_text/regexp/directional_markers_group.py: -------------------------------------------------------------------------------- 1 | directional_markers_group = r'\u202A-\u202E\u061C\u200E\u200F\u2066\u2067\u2068\u2069' 2 | -------------------------------------------------------------------------------- /twitter_text/regexp/valid_url_query_ending_chars.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | valid_url_query_ending_chars = re.compile(r'[a-z0-9\-_&=#/]', re.IGNORECASE) 4 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py37, py38, py39, py310, py311 3 | 4 | [testenv] 5 | commands = 6 | pytest 7 | deps = 8 | pytest 9 | PyYAML 10 | -------------------------------------------------------------------------------- /twitter_text/regexp/valid_url_query_chars.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | valid_url_query_chars = re.compile(r"[a-z0-9!?*'@();:&=+$/%#\[\]\-_.,~|]", re.IGNORECASE) 4 | -------------------------------------------------------------------------------- /twitter_text/regexp/invalid_url_without_protocol_preceding_chars.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | invalid_url_without_protocol_preceding_chars = re.compile(r'[-_./]$') 4 | -------------------------------------------------------------------------------- /twitter_text/has_invalid_characters.py: -------------------------------------------------------------------------------- 1 | from .regexp.invalid_chars import invalid_chars 2 | 3 | 4 | def has_invalid_characters(text: str) -> bool: 5 | return invalid_chars.search(text) is not None 6 | -------------------------------------------------------------------------------- /twitter_text/regexp/latin_accent_chars.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | latin_accent_chars = re.compile( 4 | r'\xC0-\xD6\xD8-\xF6\xF8-\xFF\u0100-\u024F\u0253\u0254\u0256\u0257\u0259\u025B\u0263\u0268\u026F\u0272\u0289\u028B\u02BB\u0300-\u036F\u1E00-\u1EFF' 5 | ) 6 | -------------------------------------------------------------------------------- /twitter_text/regexp/invalid_chars.py: -------------------------------------------------------------------------------- 1 | from .invalid_chars_group import invalid_chars_group 2 | from ..regex_supplant import regex_supplant 3 | 4 | invalid_chars = regex_supplant( 5 | r'[#{invalid_chars_group}]', 6 | {'invalid_chars_group': invalid_chars_group} 7 | ) 8 | -------------------------------------------------------------------------------- /twitter_text/regexp/valid_domain_chars.py: -------------------------------------------------------------------------------- 1 | from .invalid_domain_chars import invalid_domain_chars 2 | 
from ..regex_supplant import regex_supplant 3 | 4 | valid_domain_chars = regex_supplant( 5 | r'[^#{invalid_domain_chars}]', 6 | { 7 | 'invalid_domain_chars': invalid_domain_chars 8 | } 9 | ) 10 | -------------------------------------------------------------------------------- /twitter_text/regexp/valid_domain_name.py: -------------------------------------------------------------------------------- 1 | from .valid_domain_chars import valid_domain_chars 2 | from ..regex_supplant import regex_supplant 3 | 4 | valid_domain_name = regex_supplant( 5 | r'(?:(?:#{valid_domain_chars}(?:-|#{valid_domain_chars})*)?#{valid_domain_chars}\.)', 6 | { 7 | 'valid_domain_chars': valid_domain_chars 8 | } 9 | ) 10 | -------------------------------------------------------------------------------- /twitter_text/regexp/valid_subdomain.py: -------------------------------------------------------------------------------- 1 | from .valid_domain_chars import valid_domain_chars 2 | from ..regex_supplant import regex_supplant 3 | 4 | valid_subdomain = regex_supplant( 5 | r'(?:(?:#{valid_domain_chars}(?:[_-]|#{valid_domain_chars})*)?#{valid_domain_chars}\.)', 6 | { 7 | 'valid_domain_chars': valid_domain_chars 8 | } 9 | ) 10 | -------------------------------------------------------------------------------- /twitter_text/__init__.py: -------------------------------------------------------------------------------- 1 | from twitter_text.parse_tweet import parse_tweet, ParsedResult 2 | from twitter_text.extract_urls import extract_urls, extract_urls_with_indices 3 | from twitter_text.extract_emojis import extract_emojis_with_indices 4 | 5 | __all__ = [ 6 | 'ParsedResult', 7 | 'parse_tweet', 8 | 'extract_urls', 9 | 'extract_urls_with_indices', 10 | 'extract_emojis_with_indices', 11 | ] 12 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report_ja.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: バグ報告 3 | about: 日本語でのバグ報告 4 | title: "[BUG] " 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **バグの概要** 11 | 12 | **本来期待される動作** 13 | 14 | **バグを再現する手順** 15 | 1. '...' を開く 16 | 2. '...' をクリック 17 | 3. '....' のところまでスクロール 18 | 4. 
エラーが発生 19 | 20 | **環境:** 21 | - OS: [例) macOS Mojave version 10.14.6] 22 | - Python バージョン: [例) 3.7] 23 | - パッケージバージョン [例) twitter-text-python==1.0.2] 24 | 25 | **その他の情報** 26 | -------------------------------------------------------------------------------- /twitter_text/regexp/valid_url_preceding_chars.py: -------------------------------------------------------------------------------- 1 | from .directional_markers_group import directional_markers_group 2 | from .invalid_chars_group import invalid_chars_group 3 | from ..regex_supplant import regex_supplant 4 | 5 | valid_url_preceding_chars = regex_supplant( 6 | r'(?:[^A-Za-z0-9@@$###{invalid_chars_group}]|[#{directional_markers_group}]|^)', 7 | { 8 | 'invalid_chars_group': invalid_chars_group, 9 | 'directional_markers_group': directional_markers_group 10 | } 11 | ) 12 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | build: 9 | os: ubuntu-22.04 10 | tools: 11 | python: "3.11" 12 | jobs: 13 | post_create_environment: 14 | - pip install poetry 15 | post_install: 16 | - poetry install --with docs 17 | 18 | # Build documentation in the doc/ directory with Sphinx 19 | sphinx: 20 | configuration: doc/conf.py 21 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | .. twitter-text-python documentation master file, created by 2 | sphinx-quickstart on Fri Jul 26 22:54:53 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | 7 | .. toctree:: 8 | :maxdepth: 2 9 | :caption: Contents: 10 | 11 | 12 | twitter-text-python 13 | =================== 14 | 15 | .. include:: ../README.rst 16 | :start-line: 7 17 | 18 | 19 | API References 20 | ============== 21 | 22 | .. 
automodule:: twitter_text 23 | :members: 24 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*' 7 | 8 | jobs: 9 | release: 10 | name: Build and release to PyPI 11 | runs-on: ubuntu-latest 12 | permissions: 13 | id-token: write 14 | 15 | steps: 16 | - uses: actions/checkout@v3 17 | - uses: actions/setup-python@v4 18 | with: 19 | python-version: 3.x 20 | - uses: snok/install-poetry@v1 21 | - run: poetry install --no-root -v 22 | - run: poetry build 23 | - uses: pypa/gh-action-pypi-publish@release/v1 24 | -------------------------------------------------------------------------------- /twitter_text/regexp/valid_tco_url.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from .valid_url_query_chars import valid_url_query_chars 4 | from .valid_url_query_ending_chars import valid_url_query_ending_chars 5 | from ..regex_supplant import regex_supplant 6 | 7 | valid_tco_url = regex_supplant( 8 | r'^https?:\/\/t\.co\/([a-z0-9]+)(?:\?#{valid_url_query_chars}*#{valid_url_query_ending_chars})?', 9 | { 10 | 'valid_url_query_chars': valid_url_query_chars, 11 | 'valid_url_query_ending_chars': valid_url_query_ending_chars 12 | }, 13 | re.IGNORECASE 14 | ) 15 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | 9 | jobs: 10 | test: 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | python-version: ['3.7', '3.8', '3.9', '3.10', '3.11'] 15 | 16 | steps: 17 | - uses: actions/checkout@v3 18 | - uses: actions/setup-python@v4 19 | with: 20 | python-version: ${{ matrix.python-version }} 21 | - uses: snok/install-poetry@v1 22 | - run: poetry install --no-root -v 23 | - run: poetry run pytest 24 | -------------------------------------------------------------------------------- /twitter_text/regexp/valid_general_url_path_chars.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from .cyrillic_letters_and_marks import cyrillic_letters_and_marks 4 | from .latin_accent_chars import latin_accent_chars 5 | from ..regex_supplant import regex_supplant 6 | 7 | valid_general_url_path_chars = regex_supplant( 8 | re.compile( 9 | r"[a-z#{cyrillic_letters_and_marks}0-9!\*';:=\+,\.\$\/%#\[\]\-\u2013_~@\|&#{latin_accent_chars}]", 10 | re.IGNORECASE 11 | ), 12 | { 13 | 'cyrillic_letters_and_marks': cyrillic_letters_and_marks, 14 | 'latin_accent_chars': latin_accent_chars 15 | } 16 | ) 17 | -------------------------------------------------------------------------------- /twitter_text/regexp/invalid_domain_chars.py: -------------------------------------------------------------------------------- 1 | from .directional_markers_group import directional_markers_group 2 | from .invalid_chars_group import invalid_chars_group 3 | from .punct import punct 4 | from .spaces_group import spaces_group 5 | from ..regex_supplant import regex_supplant 6 | 7 | invalid_domain_chars = regex_supplant( 8 | r'#{punct}#{spaces_group}#{invalid_chars_group}#{directional_markers_group}', 9 | { 10 | 'punct': punct, 11 | 'spaces_group': spaces_group, 12 | 'invalid_chars_group': invalid_chars_group, 13 | 
'directional_markers_group': directional_markers_group 14 | } 15 | ) 16 | -------------------------------------------------------------------------------- /twitter_text/regex_supplant.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Dict, Union, Pattern, Match 3 | 4 | 5 | def regex_supplant(regex: Union[str, Pattern], dic: Dict[str, Union[str, Pattern]], flags=0) -> Pattern: 6 | def repl(match: Match) -> str: 7 | name = match.group(1) 8 | pattern = dic.get(name, '') 9 | return pattern if isinstance(pattern, str) else pattern.pattern 10 | 11 | regex_str = regex if isinstance(regex, str) else regex.pattern 12 | new_flags = flags if isinstance(regex, str) else regex.flags | flags 13 | assembled_pat = re.sub(r'#\{(\w+)\}', repl, regex_str) 14 | 15 | return re.compile(assembled_pat, new_flags) 16 | -------------------------------------------------------------------------------- /twitter_text/regexp/valid_ascii_domain.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from .latin_accent_chars import latin_accent_chars 4 | from .valid_cctld import valid_cctld 5 | from .valid_gtld import valid_gtld 6 | from .valid_punycode import valid_punycode 7 | from ..regex_supplant import regex_supplant 8 | 9 | valid_ascii_domain = regex_supplant( 10 | re.compile( 11 | r'(?:(?:[\-a-z0-9#{latin_accent_chars}]+)\.)+(?:#{valid_gtld}|#{valid_cctld}|#{valid_punycode})', 12 | re.IGNORECASE 13 | ), 14 | { 15 | 'latin_accent_chars': latin_accent_chars, 16 | 'valid_gtld': valid_gtld, 17 | 'valid_cctld': valid_cctld, 18 | 'valid_punycode': valid_punycode 19 | } 20 | ) 21 | -------------------------------------------------------------------------------- /twitter_text/regexp/valid_domain.py: -------------------------------------------------------------------------------- 1 | from .valid_cctld import valid_cctld 2 | from .valid_domain_name import valid_domain_name 3 | from .valid_gtld import valid_gtld 4 | from .valid_punycode import valid_punycode 5 | from .valid_subdomain import valid_subdomain 6 | from ..regex_supplant import regex_supplant 7 | 8 | valid_domain = regex_supplant( 9 | r'(?:#{valid_subdomain}*#{valid_domain_name}(?:#{valid_gtld}|#{valid_cctld}|#{valid_punycode}))', 10 | { 11 | 'valid_subdomain': valid_subdomain, 12 | 'valid_domain_name': valid_domain_name, 13 | 'valid_gtld': valid_gtld, 14 | 'valid_cctld': valid_cctld, 15 | 'valid_punycode': valid_punycode 16 | } 17 | ) 18 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | ## [3.0.0] - 2023-05-24 3 | - [change] Drop support of Python 3.6. 4 | - [change] Support Python 3.10 and 3.11. 5 | - [change] Remove dependency on the package `attrs`. 6 | 7 | ## [2.0.1] - 2023-05-24 8 | - [fix] Loosen the version requirement of the package `attrs`. 9 | 10 | ## [2.0.0] - 2021-03-29 11 | - [change] Drop support of Python 3.5. 12 | - [change] Support Python 3.8 and 3.9. 13 | 14 | ## [1.0.2] - 2020-05-25 15 | - [fix] Loosen the version requirement of the package `attrs`. 16 | 17 | ## [1.0.1] - 2020-05-24 18 | - [fix] Fix a bug where CRLF (`\r\n`) is counted as two characters. 19 | - [fix] Prevent `UnicodeDecodeError` in Windows environment while installing the package. 
20 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = doc 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /twitter_text/regexp/valid_url_balanced_parens.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from .valid_general_url_path_chars import valid_general_url_path_chars 4 | from ..regex_supplant import regex_supplant 5 | 6 | valid_url_balanced_parens = regex_supplant( 7 | '\\(' + 8 | '(?:' + 9 | '#{valid_general_url_path_chars}+' + 10 | '|' + 11 | # allow one nested level of balanced parentheses 12 | '(?:' + 13 | '#{valid_general_url_path_chars}*' + 14 | '\\(' + 15 | '#{valid_general_url_path_chars}+' + 16 | '\\)' + 17 | '#{valid_general_url_path_chars}*' + 18 | ')' + 19 | ')' + 20 | '\\)', 21 | { 22 | 'valid_general_url_path_chars': valid_general_url_path_chars 23 | }, 24 | re.IGNORECASE 25 | ) 26 | -------------------------------------------------------------------------------- /twitter_text/regexp/valid_url_path_ending_chars.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from .cyrillic_letters_and_marks import cyrillic_letters_and_marks 4 | from .latin_accent_chars import latin_accent_chars 5 | from .valid_url_balanced_parens import valid_url_balanced_parens 6 | from ..regex_supplant import regex_supplant 7 | 8 | valid_url_path_ending_chars = regex_supplant( 9 | re.compile( 10 | r'[\+\-a-z#{cyrillic_letters_and_marks}0-9=_#\/#{latin_accent_chars}]|(?:#{valid_url_balanced_parens})', 11 | re.IGNORECASE 12 | ), 13 | { 14 | 'cyrillic_letters_and_marks': cyrillic_letters_and_marks, 15 | 'latin_accent_chars': latin_accent_chars, 16 | 'valid_url_balanced_parens': valid_url_balanced_parens 17 | } 18 | ) 19 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report_en.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: "[BUG] " 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **Expected behavior** 14 | A clear and concise description of what you expected to happen. 15 | 16 | **To Reproduce** 17 | Steps to reproduce the behavior: 18 | 1. Go to '...' 19 | 2. Click on '....' 20 | 3. Scroll down to '....' 21 | 4. See error 22 | 23 | **Environment:** 24 | - OS: [e.g. macOS Mojave version 10.14.6] 25 | - Python version: [e.g. 3.7] 26 | - Package version [e.g. twitter-text-python==1.0.2] 27 | 28 | **Additional context** 29 | Add any other context about the problem here. 
30 | -------------------------------------------------------------------------------- /twitter_text/extract_emojis.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from .regexp.emoji import emoji 4 | 5 | 6 | def extract_emojis_with_indices(text: str) -> List[dict]: 7 | """ 8 | Extract emojis present in ``text`` along with their Unicode code point indices. 9 | 10 | >>> extract_emojis_with_indices('text 😷') 11 | {'emoji': '😷', 'indices': [5, 6]} 12 | 13 | >>> extract_emojis_with_indices('🙋🏽👨‍🎤') 14 | [{'emoji': '🙋🏽', 'indices': [0, 2]}, {'emoji': '👨\u200d🎤', 'indices': [2, 5]}] 15 | """ 16 | def generator(): 17 | for match in emoji.finditer(text): 18 | yield { 19 | 'emoji': match.group(0), 20 | 'indices': [match.start(), match.end()] 21 | } 22 | 23 | return list(generator()) 24 | -------------------------------------------------------------------------------- /twitter_text/regexp/valid_url_path.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from .valid_general_url_path_chars import valid_general_url_path_chars 4 | from .valid_url_balanced_parens import valid_url_balanced_parens 5 | from .valid_url_path_ending_chars import valid_url_path_ending_chars 6 | from ..regex_supplant import regex_supplant 7 | 8 | valid_url_path = regex_supplant( 9 | '(?:' + 10 | '(?:' + 11 | '#{valid_general_url_path_chars}*' + 12 | '(?:#{valid_url_balanced_parens}#{valid_general_url_path_chars}*)*' + 13 | '#{valid_url_path_ending_chars}' + 14 | ')|(?:@#{valid_general_url_path_chars}+/)' + 15 | ')', 16 | { 17 | 'valid_general_url_path_chars': valid_general_url_path_chars, 18 | 'valid_url_balanced_parens': valid_url_balanced_parens, 19 | 'valid_url_path_ending_chars': valid_url_path_ending_chars 20 | }, 21 | re.IGNORECASE 22 | ) 23 | -------------------------------------------------------------------------------- /twitter_text/get_character_weight.py: -------------------------------------------------------------------------------- 1 | def get_character_weight(char: str, options: dict) -> int: 2 | """ 3 | Return an integer weight corresponding to `char`. 4 | The weight is determined by the Unicode code point of `char` and ranges specified by `options`. 5 | 6 | >>> char = '日' 7 | >>> options = { 8 | ... 'default_weight': 200, 9 | ... 'ranges': [ 10 | ... { 'start': 0, 'end': 4351, 'weight': 100 }, 11 | ... { 'start': 8192, 'end': 8205, 'weight': 100 } 12 | ... 
] 13 | >>> get_character_weight(char, options) 14 | 200 15 | """ 16 | ranges = options['ranges'] 17 | char_code_point = ord(char[0]) 18 | match = [range['weight'] for range in ranges if range['start'] <= char_code_point <= range['end']] 19 | weight = match[0] if match != [] else options['default_weight'] 20 | 21 | return weight 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | Copyright (c) 2019 swen128 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 5 | 6 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 7 | 8 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 9 | -------------------------------------------------------------------------------- /tests/test_added.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | import pytest 4 | import yaml 5 | 6 | from twitter_text import parse_tweet, extract_urls_with_indices 7 | 8 | 9 | def read_yaml(path) -> dict: 10 | with open(path, mode='r', encoding='utf-8') as f: 11 | return yaml.safe_load(f) 12 | 13 | 14 | def get_table(test_cases: dict, group_name: str) -> Tuple[str, List[list]]: 15 | header = ",".join(test_cases[group_name][0].keys()) 16 | values = [list(case.values()) for case in test_cases[group_name]] 17 | return header, values 18 | 19 | 20 | def parametrize(test_cases: dict, group_name: str): 21 | return pytest.mark.parametrize(*get_table(test_cases, group_name)) 22 | 23 | 24 | added = read_yaml('tests/cases/added.yml')['tests'] 25 | 26 | 27 | @parametrize(added, 'ParseTweet') 28 | def test_added_parse_tweet(description: str, text: str, expected: dict): 29 | assert parse_tweet(text).asdict() == expected 30 | 31 | 32 | @parametrize(added, 'ExtractUrlsWithIndices') 33 | def test_added_extract_urls_with_indices(description: str, text: str, expected: dict): 34 | assert extract_urls_with_indices(text) == expected 35 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | // For format details, see https://aka.ms/devcontainer.json. For config options, see the 2 | // README at: https://github.com/devcontainers/templates/tree/main/src/python 3 | { 4 | "name": "Python 3", 5 | // Or use a Dockerfile or Docker Compose file. 
More info: https://containers.dev/guide/dockerfile 6 | "image": "mcr.microsoft.com/devcontainers/python:0-3.11-bullseye", 7 | "features": { 8 | "ghcr.io/devcontainers-contrib/features/poetry:2": { 9 | "version": "latest" 10 | }, 11 | "ghcr.io/devcontainers-contrib/features/tox:2": { 12 | "version": "latest" 13 | } 14 | }, 15 | 16 | // Features to add to the dev container. More info: https://containers.dev/features. 17 | // "features": {}, 18 | 19 | // Use 'forwardPorts' to make a list of ports inside the container available locally. 20 | // "forwardPorts": [], 21 | 22 | // Use 'postCreateCommand' to run commands after the container is created. 23 | "postCreateCommand": "poetry install --with docs" 24 | 25 | // Configure tool-specific properties. 26 | // "customizations": {}, 27 | 28 | // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. 29 | // "remoteUser": "root" 30 | } 31 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "twitter-text-parser" 3 | version = "3.0.0" 4 | description = "A library to parse or validate Twitter texts properly" 5 | authors = ["swen128 "] 6 | readme = "README.rst" 7 | repository = "https://github.com/swen128/twitter-text-python" 8 | packages = [ 9 | {include = "twitter_text"}, 10 | ] 11 | classifiers = [ 12 | "Programming Language :: Python :: 3.7", 13 | "Programming Language :: Python :: 3.8", 14 | "Programming Language :: Python :: 3.9", 15 | "Programming Language :: Python :: 3.10", 16 | "Programming Language :: Python :: 3.11", 17 | "License :: OSI Approved :: MIT License", 18 | "Operating System :: OS Independent", 19 | "Intended Audience :: Developers", 20 | "Topic :: Text Processing" 21 | ] 22 | 23 | [tool.poetry.dependencies] 24 | python = "^3.7" 25 | 26 | [tool.poetry.group.test.dependencies] 27 | pyyaml = "^6.0" 28 | pytest = "^7.3.1" 29 | 30 | [tool.poetry.group.docs] 31 | optional = true 32 | 33 | [tool.poetry.group.docs.dependencies] 34 | sphinx = {version = "^6.2.1", python = "^3.8"} 35 | docutils = "^0.18" 36 | sphinx-rtd-theme = "^1.2.1" 37 | 38 | [build-system] 39 | requires = ["poetry-core>=1.0.0"] 40 | build-backend = "poetry.core.masonry.api" 41 | -------------------------------------------------------------------------------- /twitter_text/regexp/extract_url.py: -------------------------------------------------------------------------------- 1 | import re 2 | from .valid_domain import valid_domain 3 | from .valid_url_query_chars import valid_url_query_chars 4 | from .valid_url_query_ending_chars import valid_url_query_ending_chars 5 | from .valid_port_number import valid_port_number 6 | from .valid_url_path import valid_url_path 7 | from .valid_url_preceding_chars import valid_url_preceding_chars 8 | from ..regex_supplant import regex_supplant 9 | 10 | extract_url = regex_supplant( 11 | '(' + # $1 total match 12 | '(#{valid_url_preceding_chars})' + # $2 Preceding character 13 | '(' + # $3 URL 14 | '(https?:\\/\\/)?' + # $4 Protocol (optional) 15 | '(#{valid_domain})' + # $5 Domain(s) 16 | '(?::(#{valid_port_number}))?' + # $6 Port number (optional) 17 | '(\\/#{valid_url_path}*)?' + # $7 URL Path 18 | '(\\?#{valid_url_query_chars}*#{valid_url_query_ending_chars})?' 
+ # $8 Query String 19 | ')' + 20 | ')', 21 | { 22 | 'valid_domain': valid_domain, 23 | 'valid_url_query_chars': valid_url_query_chars, 24 | 'valid_url_query_ending_chars': valid_url_query_ending_chars, 25 | 'valid_port_number': valid_port_number, 26 | 'valid_url_path': valid_url_path, 27 | 'valid_url_preceding_chars': valid_url_preceding_chars 28 | }, 29 | re.IGNORECASE 30 | ) 31 | -------------------------------------------------------------------------------- /twitter_text/regexp/valid_cctld.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | valid_cctld = re.compile( 4 | '(?:(?:' + 5 | '한국|香港|澳門|新加坡|台灣|台湾|中國|中国|გე|ລາວ|ไทย|ලංකා|ഭാരതം|ಭಾರತ|భారత్|சிங்கப்பூர்|இலங்கை|இந்தியா|ଭାରତ|' + 6 | 'ભારત|ਭਾਰਤ|ভাৰত|ভারত|বাংলা|भारोत|भारतम्|भारत|ڀارت|پاکستان|موريتانيا|مليسيا|مصر|قطر|فلسطين|عمان|' + 7 | 'عراق|سورية|سودان|تونس|بھارت|بارت|ایران|امارات|المغرب|السعودية|الجزائر|البحرين|الاردن|հայ|қаз|' + 8 | 'укр|срб|рф|мон|мкд|ею|бел|бг|ευ|ελ|zw|zm|za|yt|ye|ws|wf|vu|vn|vi|vg|ve|vc|va|uz|uy|us|um|uk|' + 9 | 'ug|ua|tz|tw|tv|tt|tr|tp|to|tn|tm|tl|tk|tj|th|tg|tf|td|tc|sz|sy|sx|sv|su|st|ss|sr|so|sn|sm|sl|' + 10 | 'sk|sj|si|sh|sg|se|sd|sc|sb|sa|rw|ru|rs|ro|re|qa|py|pw|pt|ps|pr|pn|pm|pl|pk|ph|pg|pf|pe|pa|om|' + 11 | 'nz|nu|nr|np|no|nl|ni|ng|nf|ne|nc|na|mz|my|mx|mw|mv|mu|mt|ms|mr|mq|mp|mo|mn|mm|ml|mk|mh|mg|mf|' + 12 | 'me|md|mc|ma|ly|lv|lu|lt|ls|lr|lk|li|lc|lb|la|kz|ky|kw|kr|kp|kn|km|ki|kh|kg|ke|jp|jo|jm|je|it|' + 13 | 'is|ir|iq|io|in|im|il|ie|id|hu|ht|hr|hn|hm|hk|gy|gw|gu|gt|gs|gr|gq|gp|gn|gm|gl|gi|gh|gg|gf|ge|' + 14 | 'gd|gb|ga|fr|fo|fm|fk|fj|fi|eu|et|es|er|eh|eg|ee|ec|dz|do|dm|dk|dj|de|cz|cy|cx|cw|cv|cu|cr|co|' + 15 | 'cn|cm|cl|ck|ci|ch|cg|cf|cd|cc|ca|bz|by|bw|bv|bt|bs|br|bq|bo|bn|bm|bl|bj|bi|bh|bg|bf|be|bd|bb|' + 16 | 'ba|az|ax|aw|au|at|as|ar|aq|ao|an|am|al|ai|ag|af|ae|ad|ac' + 17 | ')(?=[^0-9a-zA-Z@+-]|$))' 18 | ) 19 | -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # http://www.sphinx-doc.org/en/master/config 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | 16 | import sphinx_rtd_theme 17 | 18 | sys.path.insert(0, os.path.abspath('../')) 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = 'twitter-text-python' 23 | copyright = '2019, swen128' 24 | author = 'swen128' 25 | 26 | # -- General configuration --------------------------------------------------- 27 | 28 | # Add any Sphinx extension module names here, as strings. They can be 29 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 30 | # ones. 31 | extensions = [ 32 | 'sphinx.ext.autodoc' 33 | ] 34 | 35 | # Add any paths that contain templates here, relative to this directory. 36 | templates_path = ['_templates'] 37 | 38 | # List of patterns, relative to source directory, that match files and 39 | # directories to ignore when looking for source files. 
40 | # This pattern also affects html_static_path and html_extra_path. 41 | exclude_patterns = [] 42 | 43 | # -- Options for HTML output ------------------------------------------------- 44 | 45 | # The theme to use for HTML and HTML Help pages. See the documentation for 46 | # a list of builtin themes. 47 | # 48 | html_theme = 'sphinx_rtd_theme' 49 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 50 | 51 | # Add any paths that contain custom static files (such as style sheets) here, 52 | # relative to this directory. They are copied after the builtin static files, 53 | # so a file named "default.css" will overwrite the builtin "default.css". 54 | html_static_path = ['_static'] 55 | 56 | # As suggested in https://stackoverflow.com/a/56448499 57 | master_doc = 'index' 58 | -------------------------------------------------------------------------------- /tests/test_conformance.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | import pytest 4 | import yaml 5 | 6 | from twitter_text import parse_tweet, extract_urls, extract_urls_with_indices 7 | 8 | 9 | def read_yaml(path) -> dict: 10 | with open(path, mode='r', encoding='utf-8') as f: 11 | return yaml.safe_load(f) 12 | 13 | 14 | def get_table(test_cases: dict, group_name: str) -> Tuple[str, List[list]]: 15 | header = ",".join(test_cases[group_name][0].keys()) 16 | values = [list(case.values()) for case in test_cases[group_name]] 17 | return header, values 18 | 19 | 20 | def parametrize(test_cases: dict, group_name: str): 21 | return pytest.mark.parametrize(*get_table(test_cases, group_name)) 22 | 23 | 24 | extract = read_yaml('tests/cases/extract.yml')['tests'] 25 | tlds = read_yaml('tests/cases/tlds.yml')['tests'] 26 | validate = read_yaml('tests/cases/validate.yml')['tests'] 27 | 28 | 29 | @parametrize(extract, 'tco_urls_with_params') 30 | def test_extract_tco_urls_with_params(description: str, text: str, expected: List[str]): 31 | assert extract_urls(text) == expected 32 | 33 | 34 | @parametrize(extract, 'urls') 35 | def test_extract_urls(description: str, text: str, expected: List[str]): 36 | assert extract_urls(text) == expected 37 | 38 | 39 | @parametrize(tlds, 'country') 40 | def test_tlds_country(description: str, text: str, expected: List[str]): 41 | assert extract_urls(text) == expected 42 | 43 | 44 | @parametrize(extract, 'urls_with_indices') 45 | def test_extract_urls_with_indices(description: str, text: str, expected: dict): 46 | assert extract_urls_with_indices(text) == expected 47 | 48 | 49 | @parametrize(extract, 'urls_with_directional_markers') 50 | def test_extract_urls_with_directional_markers(description: str, text: str, expected: dict): 51 | assert extract_urls_with_indices(text) == expected 52 | 53 | 54 | @parametrize(validate, 'WeightedTweetsWithDiscountedEmojiCounterTest') 55 | def test_validate_weighted_tweets_with_discounted_emoji_counter_test(description: str, text: str, expected: dict): 56 | assert parse_tweet(text).asdict() == expected 57 | 58 | 59 | @parametrize(validate, 'UnicodeDirectionalMarkerCounterTest') 60 | def test_validate_unicode_directional_marker_counter_test(description: str, text: str, expected: dict): 61 | assert parse_tweet(text).asdict() == expected 62 | -------------------------------------------------------------------------------- /tests/cases/added.yml: -------------------------------------------------------------------------------- 1 | tests: 2 | ExtractUrlsWithIndices: 3 | - description: "t.co URL 
immediately followed by another t.co URL" 4 | text: "https://t.co/slug/https://t.co/slug" 5 | expected: 6 | - url: "https://t.co/slug" 7 | indices: [0, 17] 8 | ParseTweet: 9 | - description: "CRLF character" 10 | text: "a\r\nb" 11 | expected: 12 | weightedLength: 3 13 | valid: true 14 | permillage: 10 15 | displayRangeStart: 0 16 | displayRangeEnd: 3 17 | validRangeStart: 0 18 | validRangeEnd: 3 19 | - description: "A URL containing emojis" 20 | text: "https://😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷.jp" 21 | expected: 22 | weightedLength: 23 23 | valid: true 24 | permillage: 82 25 | displayRangeStart: 0 26 | displayRangeEnd: 62 27 | validRangeStart: 0 28 | validRangeEnd: 62 29 | - description: "Hangul syllables such as gag (which may be a single character, or a sequence of conjoining jamos)" 30 | text: "각각" 31 | expected: 32 | weightedLength: 4 33 | valid: true 34 | permillage: 14 35 | displayRangeStart: 0 36 | displayRangeEnd: 5 37 | validRangeStart: 0 38 | validRangeEnd: 5 39 | - description: "One grapheme cluster composed of two Unicode code points (in Normalized Form C)" 40 | text: "\u1E9B\u0323" 41 | expected: 42 | weightedLength: 3 43 | valid: true 44 | permillage: 10 45 | displayRangeStart: 0 46 | displayRangeEnd: 1 47 | validRangeStart: 0 48 | validRangeEnd: 1 49 | ExtendedGraphemeClusters: 50 | - description: "Tamil 'ni'" 51 | text: "நிநி" 52 | expected: 53 | weightedLength: 4 54 | valid: true 55 | permillage: 14 56 | displayRangeStart: 0 57 | displayRangeEnd: 3 58 | validRangeStart: 0 59 | validRangeEnd: 3 60 | - description: "Thai 'e'" 61 | text: "เเ" 62 | expected: 63 | weightedLength: 2 64 | valid: true 65 | permillage: 7 66 | displayRangeStart: 0 67 | displayRangeEnd: 1 68 | validRangeStart: 0 69 | validRangeEnd: 1 70 | - description: "Devanagari letter 'ssi'" 71 | text: "षिषि" 72 | expected: 73 | weightedLength: 4 74 | valid: true 75 | permillage: 14 76 | displayRangeStart: 0 77 | displayRangeEnd: 3 78 | validRangeStart: 0 79 | validRangeEnd: 3 80 | - description: "Thai 'kam'" 81 | text: "กำกำ" 82 | expected: 83 | weightedLength: 4 84 | valid: true 85 | permillage: 14 86 | displayRangeStart: 0 87 | displayRangeEnd: 3 88 | validRangeStart: 0 89 | validRangeEnd: 3 90 | -------------------------------------------------------------------------------- /twitter_text/config.py: -------------------------------------------------------------------------------- 1 | config = { 2 | "version1": { 3 | "version": 1, 4 | "max_weighted_tweet_length": 140, 5 | "scale": 1, 6 | "default_weight": 1, 7 | "transformed_url_length": 23, 8 | "ranges": [] 9 | }, 10 | "version2": { 11 | "version": 2, 12 | "max_weighted_tweet_length": 280, 13 | "scale": 100, 14 | "default_weight": 200, 15 | "transformed_url_length": 23, 16 | "ranges": [ 17 | { 18 | "start": 0, 19 | "end": 4351, 20 | "weight": 100 21 | }, 22 | { 23 | "start": 8192, 24 | "end": 8205, 25 | "weight": 100 26 | }, 27 | { 28 | "start": 8208, 29 | "end": 8223, 30 | "weight": 100 31 | }, 32 | { 33 | "start": 8242, 34 | "end": 8247, 35 | "weight": 100 36 | } 37 | ] 38 | }, 39 | "version3": { 40 | "version": 3, 41 | "max_weighted_tweet_length": 280, 42 | "scale": 100, 43 | "default_weight": 200, 44 | "emoji_parsing_enabled": True, 45 | "transformed_url_length": 23, 46 | "ranges": [ 47 | { 48 | "start": 0, 49 | "end": 4351, 50 | "weight": 100 51 | }, 52 | { 53 | "start": 8192, 54 | "end": 8205, 55 | "weight": 100 56 | }, 57 | { 58 | "start": 8208, 59 | "end": 8223, 60 | "weight": 100 61 | }, 62 | { 63 | "start": 8242, 64 | "end": 8247, 65 | "weight": 100 66 | } 
67 | ] 68 | }, 69 | "defaults": { 70 | "version": 3, 71 | "max_weighted_tweet_length": 280, 72 | "scale": 100, 73 | "default_weight": 200, 74 | "emoji_parsing_enabled": True, 75 | "transformed_url_length": 23, 76 | "ranges": [ 77 | { 78 | "start": 0, 79 | "end": 4351, 80 | "weight": 100 81 | }, 82 | { 83 | "start": 8192, 84 | "end": 8205, 85 | "weight": 100 86 | }, 87 | { 88 | "start": 8208, 89 | "end": 8223, 90 | "weight": 100 91 | }, 92 | { 93 | "start": 8242, 94 | "end": 8247, 95 | "weight": 100 96 | } 97 | ] 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | twitter-text-python 2 | =================== 3 | 4 | .. image:: https://readthedocs.org/projects/twitter-text-python/badge/?version=latest 5 | :target: https://twitter-text-python.readthedocs.io/en/latest/?badge=latest 6 | :alt: Documentation Status 7 | 8 | .. image:: https://github.com/swen128/twitter-text-python/actions/workflows/test.yml/badge.svg 9 | :target: https://github.com/swen128/twitter-text-python/actions/workflows/test.yml 10 | 11 | .. image:: https://github.com/swen128/twitter-text-python/actions/workflows/release.yml/badge.svg 12 | :target: https://github.com/swen128/twitter-text-python/actions/workflows/release.yml 13 | 14 | This is a Python port of the `twitter/twitter-text`_ libraries, fully compliant with the `official conformance test suite`_. 15 | 16 | 17 | Features 18 | ======== 19 | 20 | This library calculates length of a tweet message according to `the documentation from Twitter Developers`_, 21 | so that you can validate the tweet without calling the Web API at all. 22 | Although counting characters might seem an easy task, in actual fact it is very complicated, especially when the text contains CJK characters, URLs, or emojis. 23 | 24 | The original twitter-text libraries have *hit-highlighting* and *auto-linking* features as well, 25 | however they are not yet supported by this Python port. 26 | 27 | 28 | Usage 29 | ===== 30 | 31 | Installation 32 | ------------ 33 | 34 | .. code-block:: console 35 | 36 | $ pip install twitter-text-parser 37 | 38 | 39 | Examples 40 | -------- 41 | 42 | See `the API reference `_ for more details. 43 | 44 | .. code-block:: python 45 | 46 | from twitter_text import parse_tweet, extract_emojis_with_indices, extract_urls_with_indices 47 | 48 | text = 'english text 日本語 😷 https://example.com' 49 | 50 | assert parse_tweet(text).asdict() == { 51 | 'weightedLength': 46, 52 | 'valid': True, 53 | 'permillage': 164, 54 | 'validRangeStart': 0, 55 | 'validRangeEnd': 38, 56 | 'displayRangeStart': 0, 57 | 'displayRangeEnd': 38 58 | } 59 | 60 | assert extract_urls_with_indices(text) == [{ 61 | 'url': 'https://example.com', 62 | 'indices': [19, 38] 63 | }] 64 | 65 | assert extract_emojis_with_indices(text) == [{ 66 | 'emoji': '😷', 67 | 'indices': [17, 18] 68 | }] 69 | 70 | 71 | Related Links 72 | ============= 73 | 74 | - `twitter/twitter-text`_: The original, official twitter-text implementations for Java, Ruby, JavaScript and Objective-C 75 | - `twitter-text Parser -- Twitter Developers`_: A brief overview of the twitter-text libraries 76 | - `Counting characters -- Twitter Developers`_: An introduction to how to count characters in Twitter texts 77 | - `edmondburnett/twitter-text-python`_: Another python port of twitter-text, which is not compliant with the `official conformance test suite`_ 78 | 79 | 80 | .. 
_twitter/twitter-text: https://github.com/twitter/twitter-text 81 | .. _edmondburnett/twitter-text-python: https://github.com/edmondburnett/twitter-text-python 82 | .. _official conformance test suite: https://github.com/twitter/twitter-text/tree/master/conformance 83 | .. _search-api: https://developer.twitter.com/en/docs/tweets/search/api-reference/get-search-tweets.html 84 | .. _Counting characters -- Twitter Developers: https://developer.twitter.com/en/docs/basics/counting-characters.html 85 | .. _the documentation from Twitter Developers: https://developer.twitter.com/en/docs/developer-utilities/twitter-text 86 | .. _twitter-text Parser -- Twitter Developers: https://developer.twitter.com/en/docs/developer-utilities/twitter-text 87 | -------------------------------------------------------------------------------- /twitter_text/extract_urls.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | 3 | from .regexp.extract_url import extract_url 4 | from .regexp.invalid_url_without_protocol_preceding_chars import invalid_url_without_protocol_preceding_chars 5 | from .regexp.valid_ascii_domain import valid_ascii_domain 6 | from .regexp.valid_tco_url import valid_tco_url 7 | 8 | default_protocol = 'https://' 9 | max_url_length = 4096 10 | max_tco_slug_length = 40 11 | 12 | 13 | def extract_urls(text: str, extract_urls_without_protocol: bool = True) -> List[str]: 14 | """ 15 | Extract valid URLs present in ``text``. 16 | 17 | >>> extract_urls('http://twitter.com/これは日本語です。example.com中国語') 18 | ["url": "http://twitter.com/", "example.com"] 19 | """ 20 | return [dic['url'] for dic in extract_urls_with_indices(text, extract_urls_without_protocol)] 21 | 22 | 23 | def extract_urls_with_indices(text: str, extract_urls_without_protocol: bool = True) -> List[dict]: 24 | """ 25 | Extract valid URLs present in ``text`` along with their Unicode code point indices. 26 | 27 | >>> extract_urls_with_indices('http://twitter.com/これは日本語です。example.com中国語') 28 | [ 29 | { 30 | "url": "http://twitter.com/", 31 | "indices": [0, 19] 32 | }, 33 | { 34 | "url": "example.com", 35 | "indices": [28, 39] 36 | } 37 | ] 38 | """ 39 | if text == '' or ('.' not in text if extract_urls_without_protocol else ':' not in text): 40 | return [] 41 | 42 | urls = [] 43 | 44 | for url_match in extract_url.finditer(text): 45 | _, before, url, protocol, domain, _, path, _ = url_match.groups() 46 | end_position = url_match.end() 47 | start_position = end_position - len(url) 48 | 49 | if not is_valid_url(url, protocol or default_protocol, domain): 50 | continue 51 | 52 | # extract ASCII-only domains. 53 | if protocol is None: 54 | if not extract_urls_without_protocol or \ 55 | invalid_url_without_protocol_preceding_chars.match(before): 56 | continue 57 | 58 | last_url = None 59 | for ascii_domain_match in valid_ascii_domain.finditer(domain): 60 | ascii_domain = ascii_domain_match.group(0) 61 | ascii_start_position = ascii_domain_match.start() 62 | ascii_end_position = ascii_domain_match.end() 63 | last_url = { 64 | 'url': ascii_domain, 65 | 'indices': [start_position + ascii_start_position, start_position + ascii_end_position] 66 | } 67 | urls.append(last_url) 68 | 69 | # no ASCII-only domain found. Skip the entire URL. 70 | if last_url is None: 71 | continue 72 | 73 | # lastUrl only contains domain. Need to add path and query if they exist. 
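            # In other words: the replace() below swaps the ASCII-only domain kept above into the
            # full match, and the end index is pushed out to cover the trailing path/query.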
74 | if path: 75 | last_url['url'] = url.replace(domain, last_url['url']) 76 | last_url['indices'][1] = end_position 77 | else: 78 | # In the case of t.co URLs, don't allow additional path characters. 79 | tco_url_match = valid_tco_url.search(url) 80 | 81 | if tco_url_match: 82 | tco_url_slug = tco_url_match.group(1) 83 | if tco_url_slug and len(tco_url_slug) > max_tco_slug_length: 84 | continue 85 | else: 86 | url = tco_url_match.group(0) 87 | end_position = start_position + len(url) 88 | 89 | urls.append({ 90 | 'url': url, 91 | 'indices': [start_position, end_position] 92 | }) 93 | 94 | return urls 95 | 96 | 97 | def is_valid_url(url: str, protocol: str, domain: str) -> bool: 98 | puny_encoded_domain = idna_to_ascii(domain) 99 | 100 | if (not puny_encoded_domain) or len(puny_encoded_domain) == 0: 101 | return False 102 | else: 103 | url_length = len(url) + len(puny_encoded_domain) - len(domain) 104 | return len(protocol) + url_length <= max_url_length 105 | 106 | 107 | def idna_to_ascii(domain: str) -> Optional[str]: 108 | """ 109 | Convert an Internationalized Domain Name (IDN) into a Punycode string. 110 | Return `None` if the `domain` is invalid. 111 | 112 | >>> idna_to_ascii('日本語.jp') 113 | 'xn--wgv71a119e.jp' 114 | """ 115 | try: 116 | return domain.encode('idna').decode('ascii') 117 | except Exception: 118 | return None 119 | -------------------------------------------------------------------------------- /twitter_text/regexp/emoji.py: -------------------------------------------------------------------------------- 1 | import re 2 | from collections import namedtuple 3 | from typing import List, Tuple 4 | 5 | import pkg_resources 6 | 7 | Emoji = namedtuple('Emoji', ('group', 'sub_group', 'name', 'status', 'codepoint', 'emoji')) 8 | 9 | 10 | def parse_emoji_list(text: str) -> List[Emoji]: 11 | emoji_entries = [] 12 | 13 | for line in text.splitlines()[32:]: # skip the explanation lines 14 | if line == '# Status Counts': # the last line in the document 15 | break 16 | if 'subtotal:' in line: # these are lines showing statistics about each group, not needed 17 | continue 18 | if not line: # if it's a blank line 19 | continue 20 | if line.startswith('#'): # these lines contain group and/or sub-group names 21 | if '# group:' in line: 22 | group = line.split(':')[-1].strip() 23 | if '# subgroup:' in line: 24 | subgroup = line.split(':')[-1].strip() 25 | if group == 'Component': # skin tones, and hair types, skip, as mentioned above 26 | continue 27 | if re.search('^[0-9A-F]{3,}', line): # if the line starts with a hexadecimal number (an emoji code point) 28 | # here we define all the elements that will go into emoji entries 29 | codepoint = line.split(';')[0].strip() # in some cases it is one and in others multiple code points 30 | status = line.split(';')[-1].split()[0].strip() # status: fully-qualified, minimally-qualified, unqualified 31 | if line[-1] == '#': 32 | # The special case where the emoji is actually the hash sign "#". 
In this case manually assign the emoji 33 | if 'fully-qualified' in line: 34 | emoji = '#️⃣' 35 | else: 36 | emoji = '#⃣' # they look the same, but are actually different 37 | else: # the default case 38 | emoji = line.split('#')[-1].split()[0].strip() # the emoji character itself 39 | if line[-1] == '#': # (the special case) 40 | name = '#' 41 | else: # extract the emoji name 42 | name = '_'.join(line.split('#')[-1][1:].split()[1:]).replace('_', ' ') 43 | templine = Emoji( 44 | codepoint=codepoint, 45 | status=status, 46 | emoji=emoji, 47 | name=name, 48 | group=group, 49 | sub_group=subgroup) 50 | emoji_entries.append(templine) 51 | 52 | return emoji_entries 53 | 54 | 55 | def regex_for_multi_codepoint_emojis(emoji_list: List[Emoji]) -> str: 56 | multi_codepoint_emoji = [] 57 | 58 | for code in [c.codepoint.split() for c in emoji_list]: 59 | if len(code) > 1: 60 | # turn to a hexadecimal number zfilled to 8 zeros e.g: '\U0001F44D' 61 | hexified_codes = [r'\U' + x.zfill(8) for x in code] 62 | hexified_codes = ''.join(hexified_codes) # join all hexadecimal components 63 | multi_codepoint_emoji.append(hexified_codes) 64 | 65 | # sorting by length in decreasing order is extremely important 66 | multi_codepoint_emoji_sorted = sorted(multi_codepoint_emoji, key=len, reverse=True) 67 | 68 | # join with a "|" to function as an "or" in the regex 69 | multi_codepoint_emoji_joined = '|'.join(multi_codepoint_emoji_sorted) 70 | 71 | return multi_codepoint_emoji_joined 72 | 73 | 74 | def regex_for_single_codepoint_emojis(emoji_list: List[Emoji]) -> str: 75 | single_codepoint_emoji_raw = r'' # start with an empty raw string 76 | for code in single_codepoint_emoji_ranges: 77 | if code[0] == code[1]: # in this case make it a single hexadecimal character 78 | temp_regex = r'\U' + hex(code[0])[2:].zfill(8) 79 | single_codepoint_emoji_raw += temp_regex 80 | else: 81 | # otherwise create a character range, joined by '-' 82 | temp_regex = '-'.join([r'\U' + hex(code[0])[2:].zfill(8), r'\U' + hex(code[1])[2:].zfill(8)]) 83 | single_codepoint_emoji_raw += temp_regex 84 | 85 | 86 | def get_ranges(nums: List[int]) -> List[Tuple[int, int]]: 87 | """Reduce a list of integers to tuples of local maximums and minimums. 88 | 89 | :param nums: List of integers. 
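    For example, runs of consecutive integers collapse into a single (min, max) pair and
    isolated values map onto themselves:

    >>> get_ranges([1, 2, 3, 7, 8, 20])
    [(1, 3), (7, 8), (20, 20)]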
90 | :return ranges: List of tuples showing local minimums and maximums 91 | """ 92 | nums = sorted(nums) 93 | lows = [nums[0]] 94 | highs = [] 95 | if nums[1] - nums[0] > 1: 96 | highs.append(nums[0]) 97 | for i in range(1, len(nums) - 1): 98 | if (nums[i] - nums[i - 1]) > 1: 99 | lows.append(nums[i]) 100 | if (nums[i + 1] - nums[i]) > 1: 101 | highs.append(nums[i]) 102 | highs.append(nums[-1]) 103 | if len(highs) > len(lows): 104 | lows.append(highs[-1]) 105 | return [(l, h) for l, h in zip(lows, highs)] 106 | 107 | 108 | emoji_raw = pkg_resources.resource_string(__name__, 'emoji-test.txt').decode('utf-8') 109 | emoji_list = parse_emoji_list(emoji_raw) 110 | emoji_dict = {x.emoji: x for x in emoji_list} 111 | 112 | multi_codepoint_emoji_joined = regex_for_multi_codepoint_emojis(emoji_list) 113 | 114 | single_codepoint_emoji = [] 115 | 116 | for code in [c.codepoint.split() for c in emoji_list]: 117 | if len(code) == 1: 118 | single_codepoint_emoji.append(code[0]) 119 | 120 | single_codepoint_emoji_int = [int(x, base=16) for x in single_codepoint_emoji] 121 | single_codepoint_emoji_ranges = get_ranges(single_codepoint_emoji_int) 122 | 123 | single_codepoint_emoji_raw = r'' # start with an empty raw string 124 | for code in single_codepoint_emoji_ranges: 125 | if code[0] == code[1]: # in this case make it a single hexadecimal character 126 | temp_regex = r'\U' + hex(code[0])[2:].zfill(8) 127 | single_codepoint_emoji_raw += temp_regex 128 | else: 129 | # otherwise create a character range, joined by '-' 130 | temp_regex = '-'.join([r'\U' + hex(code[0])[2:].zfill(8), r'\U' + hex(code[1])[2:].zfill(8)]) 131 | single_codepoint_emoji_raw += temp_regex 132 | 133 | emoji = re.compile(multi_codepoint_emoji_joined + '|' + r'[' + single_codepoint_emoji_raw + r']') 134 | -------------------------------------------------------------------------------- /twitter_text/parse_tweet.py: -------------------------------------------------------------------------------- 1 | import re 2 | import unicodedata 3 | from dataclasses import dataclass, asdict 4 | from math import floor 5 | from typing import List, Dict 6 | 7 | from .config import config 8 | from .extract_emojis import extract_emojis_with_indices 9 | from .extract_urls import extract_urls_with_indices 10 | from .get_character_weight import get_character_weight 11 | from .has_invalid_characters import has_invalid_characters 12 | 13 | 14 | @dataclass(frozen=True) 15 | class ParsedResult: 16 | valid: bool 17 | weightedLength: int 18 | permillage: int 19 | validRangeStart: int 20 | validRangeEnd: int 21 | displayRangeStart: int 22 | displayRangeEnd: int 23 | 24 | def asdict(self) -> dict: 25 | return asdict(self) 26 | 27 | 28 | def convert_line_ending(string, to="\n"): 29 | return re.sub(r'\r\n|\r|\n', to, string) 30 | 31 | 32 | def parse_tweet(text: str, options: dict = config['defaults']) -> ParsedResult: 33 | """ 34 | Parse a Twitter text according to https://developer.twitter.com/en/docs/developer-utilities/twitter-text 35 | 36 | :param str text: A text to parse. 37 | :param dict options: Parameters for counting the weighted tweet length. This must have the following properties: 38 | 39 | max_weighted_tweet_length (int) 40 | Valid tweet messages must not exceed this weighted length. 41 | 42 | default_weight (int) 43 | Default weight to cover code points not defined in the ``ranges``. 44 | 45 | ranges (list of dict) 46 | A list of Unicode code point ranges, with a weight associated with each of these ranges. 
47 | Each element of ``ranges`` must have the following attributes: 48 | 49 | - start (int) 50 | - end (int) 51 | - weight (int) 52 | 53 | scale (int) 54 | The weights are divided by ``scale``. 55 | 56 | emoji_parsing_enabled (bool) 57 | When set to ``True``, an emoji consisting of multiple Unicode code points is counted as a single character, 58 | resulting in a visually intuitive weighted length. 59 | 60 | transformed_url_length (int) 61 | The default length assigned to all URLs. 62 | 63 | :return ParsedResult: An object having the following properties: 64 | 65 | weightedLength (int) 66 | The weighted length of the tweet text. 67 | 68 | Each Unicode character (or URL, or emoji) in ``text`` is assigned an integer weight, 69 | and these weights are summed to calculate ``weightedLength``. 70 | 71 | valid (bool) 72 | True if the ``text`` is valid, i.e., 73 | 74 | - ``weightedLength <= max_weighted_tweet_length`` 75 | - ``text`` does not contain invalid characters. 76 | 77 | permillage (int) 78 | Equal to ``floor(weightedLength / max_weighted_tweet_length * 1000)``. 79 | 80 | displayRangeStart (int) 81 | Always 0. 82 | 83 | displayRangeEnd (int) 84 | Number of UTF-16 code units in ``text``, minus one. 85 | 86 | validRangeStart (int) 87 | Always 0. 88 | 89 | validRangeEnd (int) 90 | Number of UTF-16 code units in the valid part of ``text``, minus one. 91 | 92 | The "valid part" here means the longest valid Unicode substring starting from the beginning of ``text``. 93 | 94 | 95 | Example: 96 | 97 | >>> parse_tweet('english text 日本語 😷 https://example.com') 98 | ParsedResult( 99 | valid=True, 100 | weightedLength=46, 101 | permillage=164, 102 | validRangeStart=0, 103 | validRangeEnd=38, 104 | displayRangeStart=0, 105 | displayRangeEnd=38 106 | ) 107 | """ 108 | scale = options['scale'] 109 | transformed_url_length = options['transformed_url_length'] 110 | default_weight = options['default_weight'] 111 | emoji_parsing_enabled = options['emoji_parsing_enabled'] 112 | max_weighted_tweet_length = options['max_weighted_tweet_length'] 113 | 114 | normalized_text = convert_line_ending(unicodedata.normalize('NFC', text)) 115 | 116 | url_entities_map = transform_entities_to_hash(extract_urls_with_indices(normalized_text)) 117 | emoji_entities_map = transform_entities_to_hash(extract_emojis_with_indices(normalized_text)) 118 | 119 | weighted_length = 0 120 | valid_display_index = 0 121 | valid = True 122 | char_index = 0 123 | 124 | while char_index < len(normalized_text): 125 | if char_index in url_entities_map: 126 | url = url_entities_map[char_index]['url'] 127 | weighted_length += transformed_url_length * scale 128 | char_index += len(url) - 1 129 | elif emoji_parsing_enabled and char_index in emoji_entities_map: 130 | emoji = emoji_entities_map[char_index]['emoji'] 131 | weighted_length += default_weight 132 | char_index += len(emoji) - 1 133 | else: 134 | weighted_length += get_character_weight(normalized_text[char_index], options) 135 | 136 | if valid: 137 | valid = not has_invalid_characters(normalized_text[char_index:char_index + 1]) 138 | 139 | if valid and weighted_length <= max_weighted_tweet_length * scale: 140 | valid_display_index = char_index 141 | 142 | char_index += 1 143 | 144 | weighted_length = int(weighted_length / scale) 145 | valid_display_offset = count_utf16_bytes(normalized_text[:valid_display_index + 1]) - 1 146 | normalization_offset = count_utf16_bytes(text) - count_utf16_bytes(normalized_text) 147 | 148 | return ParsedResult( 149 | weightedLength=weighted_length, 150 | 
valid=valid and 0 < weighted_length <= max_weighted_tweet_length, 151 | permillage=floor((weighted_length / max_weighted_tweet_length) * 1000), 152 | validRangeStart=0, 153 | validRangeEnd=valid_display_offset + normalization_offset, 154 | displayRangeStart=0, 155 | displayRangeEnd=count_utf16_bytes(text) - 1 if count_utf16_bytes(text) > 0 else 0 156 | ) 157 | 158 | 159 | def transform_entities_to_hash(entities: List[dict]) -> Dict[int, dict]: 160 | return {entity['indices'][0]: entity for entity in entities} 161 | 162 | 163 | def count_utf16_bytes(text: str) -> int: 164 | return len(text.encode('utf-16')) // 2 - 1 165 | -------------------------------------------------------------------------------- /twitter_text/regexp/valid_gtld.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | valid_gtld = re.compile( 4 | '(?:(?:' + 5 | '삼성|닷컴|닷넷|香格里拉|餐厅|食品|飞利浦|電訊盈科|集团|通販|购物|谷歌|诺基亚|联通|网络|网站|网店|网址|组织机构|移动|珠宝|点看|游戏|淡马锡|机构|書籍|时尚|新闻|' + 6 | '政府|政务|招聘|手表|手机|我爱你|慈善|微博|广东|工行|家電|娱乐|天主教|大拿|大众汽车|在线|嘉里大酒店|嘉里|商标|商店|商城|公益|公司|八卦|健康|信息|佛山|企业|' + 7 | '中文网|中信|世界|ポイント|ファッション|セール|ストア|コム|グーグル|クラウド|みんな|คอม|संगठन|नेट|कॉम|همراه|موقع|موبايلي|كوم|' + 8 | 'كاثوليك|عرب|شبكة|بيتك|بازار|العليان|ارامكو|اتصالات|ابوظبي|קום|сайт|рус|орг|онлайн|москва|ком|' + 9 | 'католик|дети|zuerich|zone|zippo|zip|zero|zara|zappos|yun|youtube|you|yokohama|yoga|yodobashi|' + 10 | 'yandex|yamaxun|yahoo|yachts|xyz|xxx|xperia|xin|xihuan|xfinity|xerox|xbox|wtf|wtc|wow|world|' + 11 | 'works|work|woodside|wolterskluwer|wme|winners|wine|windows|win|williamhill|wiki|wien|whoswho|' + 12 | 'weir|weibo|wedding|wed|website|weber|webcam|weatherchannel|weather|watches|watch|warman|' + 13 | 'wanggou|wang|walter|walmart|wales|vuelos|voyage|voto|voting|vote|volvo|volkswagen|vodka|' + 14 | 'vlaanderen|vivo|viva|vistaprint|vista|vision|visa|virgin|vip|vin|villas|viking|vig|video|' + 15 | 'viajes|vet|versicherung|vermögensberatung|vermögensberater|verisign|ventures|vegas|vanguard|' + 16 | 'vana|vacations|ups|uol|uno|university|unicom|uconnect|ubs|ubank|tvs|tushu|tunes|tui|tube|trv|' + 17 | 'trust|travelersinsurance|travelers|travelchannel|travel|training|trading|trade|toys|toyota|' + 18 | 'town|tours|total|toshiba|toray|top|tools|tokyo|today|tmall|tkmaxx|tjx|tjmaxx|tirol|tires|tips|' + 19 | 'tiffany|tienda|tickets|tiaa|theatre|theater|thd|teva|tennis|temasek|telefonica|telecity|tel|' + 20 | 'technology|tech|team|tdk|tci|taxi|tax|tattoo|tatar|tatamotors|target|taobao|talk|taipei|tab|' + 21 | 'systems|symantec|sydney|swiss|swiftcover|swatch|suzuki|surgery|surf|support|supply|supplies|' + 22 | 'sucks|style|study|studio|stream|store|storage|stockholm|stcgroup|stc|statoil|statefarm|' + 23 | 'statebank|starhub|star|staples|stada|srt|srl|spreadbetting|spot|sport|spiegel|space|soy|sony|' + 24 | 'song|solutions|solar|sohu|software|softbank|social|soccer|sncf|smile|smart|sling|skype|sky|' + 25 | 'skin|ski|site|singles|sina|silk|shriram|showtime|show|shouji|shopping|shop|shoes|shiksha|shia|' + 26 | 'shell|shaw|sharp|shangrila|sfr|sexy|sex|sew|seven|ses|services|sener|select|seek|security|' + 27 | 'secure|seat|search|scot|scor|scjohnson|science|schwarz|schule|school|scholarships|schmidt|' + 28 | 'schaeffler|scb|sca|sbs|sbi|saxo|save|sas|sarl|sapo|sap|sanofi|sandvikcoromant|sandvik|samsung|' + 29 | 'samsclub|salon|sale|sakura|safety|safe|saarland|ryukyu|rwe|run|ruhr|rugby|rsvp|room|rogers|' + 30 | 'rodeo|rocks|rocher|rmit|rip|rio|ril|rightathome|ricoh|richardli|rich|rexroth|reviews|review|' + 31 | 
'restaurant|rest|republican|report|repair|rentals|rent|ren|reliance|reit|reisen|reise|rehab|' + 32 | 'redumbrella|redstone|red|recipes|realty|realtor|realestate|read|raid|radio|racing|qvc|quest|' + 33 | 'quebec|qpon|pwc|pub|prudential|pru|protection|property|properties|promo|progressive|prof|' + 34 | 'productions|prod|pro|prime|press|praxi|pramerica|post|porn|politie|poker|pohl|pnc|plus|' + 35 | 'plumbing|playstation|play|place|pizza|pioneer|pink|ping|pin|pid|pictures|pictet|pics|piaget|' + 36 | 'physio|photos|photography|photo|phone|philips|phd|pharmacy|pfizer|pet|pccw|pay|passagens|' + 37 | 'party|parts|partners|pars|paris|panerai|panasonic|pamperedchef|page|ovh|ott|otsuka|osaka|' + 38 | 'origins|orientexpress|organic|org|orange|oracle|open|ooo|onyourside|online|onl|ong|one|omega|' + 39 | 'ollo|oldnavy|olayangroup|olayan|okinawa|office|off|observer|obi|nyc|ntt|nrw|nra|nowtv|nowruz|' + 40 | 'now|norton|northwesternmutual|nokia|nissay|nissan|ninja|nikon|nike|nico|nhk|ngo|nfl|nexus|' + 41 | 'nextdirect|next|news|newholland|new|neustar|network|netflix|netbank|net|nec|nba|navy|natura|' + 42 | 'nationwide|name|nagoya|nadex|nab|mutuelle|mutual|museum|mtr|mtpc|mtn|msd|movistar|movie|mov|' + 43 | 'motorcycles|moto|moscow|mortgage|mormon|mopar|montblanc|monster|money|monash|mom|moi|moe|moda|' + 44 | 'mobily|mobile|mobi|mma|mls|mlb|mitsubishi|mit|mint|mini|mil|microsoft|miami|metlife|merckmsd|' + 45 | 'meo|menu|men|memorial|meme|melbourne|meet|media|med|mckinsey|mcdonalds|mcd|mba|mattel|' + 46 | 'maserati|marshalls|marriott|markets|marketing|market|map|mango|management|man|makeup|maison|' + 47 | 'maif|madrid|macys|luxury|luxe|lupin|lundbeck|ltda|ltd|lplfinancial|lpl|love|lotto|lotte|' + 48 | 'london|lol|loft|locus|locker|loans|loan|llp|llc|lixil|living|live|lipsy|link|linde|lincoln|' + 49 | 'limo|limited|lilly|like|lighting|lifestyle|lifeinsurance|life|lidl|liaison|lgbt|lexus|lego|' + 50 | 'legal|lefrak|leclerc|lease|lds|lawyer|law|latrobe|latino|lat|lasalle|lanxess|landrover|land|' + 51 | 'lancome|lancia|lancaster|lamer|lamborghini|ladbrokes|lacaixa|kyoto|kuokgroup|kred|krd|kpn|' + 52 | 'kpmg|kosher|komatsu|koeln|kiwi|kitchen|kindle|kinder|kim|kia|kfh|kerryproperties|' + 53 | 'kerrylogistics|kerryhotels|kddi|kaufen|juniper|juegos|jprs|jpmorgan|joy|jot|joburg|jobs|jnj|' + 54 | 'jmp|jll|jlc|jio|jewelry|jetzt|jeep|jcp|jcb|java|jaguar|iwc|iveco|itv|itau|istanbul|ist|' + 55 | 'ismaili|iselect|irish|ipiranga|investments|intuit|international|intel|int|insure|insurance|' + 56 | 'institute|ink|ing|info|infiniti|industries|inc|immobilien|immo|imdb|imamat|ikano|iinet|ifm|' + 57 | 'ieee|icu|ice|icbc|ibm|hyundai|hyatt|hughes|htc|hsbc|how|house|hotmail|hotels|hoteles|hot|' + 58 | 'hosting|host|hospital|horse|honeywell|honda|homesense|homes|homegoods|homedepot|holiday|' + 59 | 'holdings|hockey|hkt|hiv|hitachi|hisamitsu|hiphop|hgtv|hermes|here|helsinki|help|healthcare|' + 60 | 'health|hdfcbank|hdfc|hbo|haus|hangout|hamburg|hair|guru|guitars|guide|guge|gucci|guardian|' + 61 | 'group|grocery|gripe|green|gratis|graphics|grainger|gov|got|gop|google|goog|goodyear|goodhands|' + 62 | 'goo|golf|goldpoint|gold|godaddy|gmx|gmo|gmbh|gmail|globo|global|gle|glass|glade|giving|gives|' + 63 | 'gifts|gift|ggee|george|genting|gent|gea|gdn|gbiz|gay|garden|gap|games|game|gallup|gallo|' + 64 | 'gallery|gal|fyi|futbol|furniture|fund|fun|fujixerox|fujitsu|ftr|frontier|frontdoor|frogans|' + 65 | 'frl|fresenius|free|fox|foundation|forum|forsale|forex|ford|football|foodnetwork|food|foo|fly|' + 66 | 
'flsmidth|flowers|florist|flir|flights|flickr|fitness|fit|fishing|fish|firmdale|firestone|fire|' + 67 | 'financial|finance|final|film|fido|fidelity|fiat|ferrero|ferrari|feedback|fedex|fast|fashion|' + 68 | 'farmers|farm|fans|fan|family|faith|fairwinds|fail|fage|extraspace|express|exposed|expert|' + 69 | 'exchange|everbank|events|eus|eurovision|etisalat|esurance|estate|esq|erni|ericsson|equipment|' + 70 | 'epson|epost|enterprises|engineering|engineer|energy|emerck|email|education|edu|edeka|eco|eat|' + 71 | 'earth|dvr|dvag|durban|dupont|duns|dunlop|duck|dubai|dtv|drive|download|dot|doosan|domains|' + 72 | 'doha|dog|dodge|doctor|docs|dnp|diy|dish|discover|discount|directory|direct|digital|diet|' + 73 | 'diamonds|dhl|dev|design|desi|dentist|dental|democrat|delta|deloitte|dell|delivery|degree|' + 74 | 'deals|dealer|deal|dds|dclk|day|datsun|dating|date|data|dance|dad|dabur|cyou|cymru|cuisinella|' + 75 | 'csc|cruises|cruise|crs|crown|cricket|creditunion|creditcard|credit|cpa|courses|coupons|coupon|' + 76 | 'country|corsica|coop|cool|cookingchannel|cooking|contractors|contact|consulting|construction|' + 77 | 'condos|comsec|computer|compare|company|community|commbank|comcast|com|cologne|college|coffee|' + 78 | 'codes|coach|clubmed|club|cloud|clothing|clinique|clinic|click|cleaning|claims|cityeats|city|' + 79 | 'citic|citi|citadel|cisco|circle|cipriani|church|chrysler|chrome|christmas|chloe|chintai|cheap|' + 80 | 'chat|chase|charity|channel|chanel|cfd|cfa|cern|ceo|center|ceb|cbs|cbre|cbn|cba|catholic|' + 81 | 'catering|cat|casino|cash|caseih|case|casa|cartier|cars|careers|career|care|cards|caravan|car|' + 82 | 'capitalone|capital|capetown|canon|cancerresearch|camp|camera|cam|calvinklein|call|cal|cafe|' + 83 | 'cab|bzh|buzz|buy|business|builders|build|bugatti|budapest|brussels|brother|broker|broadway|' + 84 | 'bridgestone|bradesco|box|boutique|bot|boston|bostik|bosch|boots|booking|book|boo|bond|bom|' + 85 | 'bofa|boehringer|boats|bnpparibas|bnl|bmw|bms|blue|bloomberg|blog|blockbuster|blanco|' + 86 | 'blackfriday|black|biz|bio|bingo|bing|bike|bid|bible|bharti|bet|bestbuy|best|berlin|bentley|' + 87 | 'beer|beauty|beats|bcn|bcg|bbva|bbt|bbc|bayern|bauhaus|basketball|baseball|bargains|barefoot|' + 88 | 'barclays|barclaycard|barcelona|bar|bank|band|bananarepublic|banamex|baidu|baby|azure|axa|aws|' + 89 | 'avianca|autos|auto|author|auspost|audio|audible|audi|auction|attorney|athleta|associates|asia|' + 90 | 'asda|arte|art|arpa|army|archi|aramco|arab|aquarelle|apple|app|apartments|aol|anz|anquan|' + 91 | 'android|analytics|amsterdam|amica|amfam|amex|americanfamily|americanexpress|alstom|alsace|' + 92 | 'ally|allstate|allfinanz|alipay|alibaba|alfaromeo|akdn|airtel|airforce|airbus|aigo|aig|agency|' + 93 | 'agakhan|africa|afl|afamilycompany|aetna|aero|aeg|adult|ads|adac|actor|active|aco|accountants|' + 94 | 'accountant|accenture|academy|abudhabi|abogado|able|abc|abbvie|abbott|abb|abarth|aarp|aaa|' + 95 | 'onion' + 96 | ')(?=[^0-9a-zA-Z@+-]|$))' 97 | ) 98 | -------------------------------------------------------------------------------- /poetry.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Poetry 1.5.0 and should not be changed by hand. 
2 | 3 | [[package]] 4 | name = "alabaster" 5 | version = "0.7.13" 6 | description = "A configurable sidebar-enabled Sphinx theme" 7 | optional = false 8 | python-versions = ">=3.6" 9 | files = [ 10 | {file = "alabaster-0.7.13-py3-none-any.whl", hash = "sha256:1ee19aca801bbabb5ba3f5f258e4422dfa86f82f3e9cefb0859b283cdd7f62a3"}, 11 | {file = "alabaster-0.7.13.tar.gz", hash = "sha256:a27a4a084d5e690e16e01e03ad2b2e552c61a65469419b907243193de1a84ae2"}, 12 | ] 13 | 14 | [[package]] 15 | name = "babel" 16 | version = "2.12.1" 17 | description = "Internationalization utilities" 18 | optional = false 19 | python-versions = ">=3.7" 20 | files = [ 21 | {file = "Babel-2.12.1-py3-none-any.whl", hash = "sha256:b4246fb7677d3b98f501a39d43396d3cafdc8eadb045f4a31be01863f655c610"}, 22 | {file = "Babel-2.12.1.tar.gz", hash = "sha256:cc2d99999cd01d44420ae725a21c9e3711b3aadc7976d6147f622d8581963455"}, 23 | ] 24 | 25 | [package.dependencies] 26 | pytz = {version = ">=2015.7", markers = "python_version < \"3.9\""} 27 | 28 | [[package]] 29 | name = "certifi" 30 | version = "2023.5.7" 31 | description = "Python package for providing Mozilla's CA Bundle." 32 | optional = false 33 | python-versions = ">=3.6" 34 | files = [ 35 | {file = "certifi-2023.5.7-py3-none-any.whl", hash = "sha256:c6c2e98f5c7869efca1f8916fed228dd91539f9f1b444c314c06eef02980c716"}, 36 | {file = "certifi-2023.5.7.tar.gz", hash = "sha256:0f0d56dc5a6ad56fd4ba36484d6cc34451e1c6548c61daad8c320169f91eddc7"}, 37 | ] 38 | 39 | [[package]] 40 | name = "charset-normalizer" 41 | version = "3.1.0" 42 | description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." 43 | optional = false 44 | python-versions = ">=3.7.0" 45 | files = [ 46 | {file = "charset-normalizer-3.1.0.tar.gz", hash = "sha256:34e0a2f9c370eb95597aae63bf85eb5e96826d81e3dcf88b8886012906f509b5"}, 47 | {file = "charset_normalizer-3.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e0ac8959c929593fee38da1c2b64ee9778733cdf03c482c9ff1d508b6b593b2b"}, 48 | {file = "charset_normalizer-3.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d7fc3fca01da18fbabe4625d64bb612b533533ed10045a2ac3dd194bfa656b60"}, 49 | {file = "charset_normalizer-3.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:04eefcee095f58eaabe6dc3cc2262f3bcd776d2c67005880894f447b3f2cb9c1"}, 50 | {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20064ead0717cf9a73a6d1e779b23d149b53daf971169289ed2ed43a71e8d3b0"}, 51 | {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1435ae15108b1cb6fffbcea2af3d468683b7afed0169ad718451f8db5d1aff6f"}, 52 | {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c84132a54c750fda57729d1e2599bb598f5fa0344085dbde5003ba429a4798c0"}, 53 | {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75f2568b4189dda1c567339b48cba4ac7384accb9c2a7ed655cd86b04055c795"}, 54 | {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:11d3bcb7be35e7b1bba2c23beedac81ee893ac9871d0ba79effc7fc01167db6c"}, 55 | {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:891cf9b48776b5c61c700b55a598621fdb7b1e301a550365571e9624f270c203"}, 56 | {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_i686.whl", hash = 
"sha256:5f008525e02908b20e04707a4f704cd286d94718f48bb33edddc7d7b584dddc1"}, 57 | {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:b06f0d3bf045158d2fb8837c5785fe9ff9b8c93358be64461a1089f5da983137"}, 58 | {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:49919f8400b5e49e961f320c735388ee686a62327e773fa5b3ce6721f7e785ce"}, 59 | {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:22908891a380d50738e1f978667536f6c6b526a2064156203d418f4856d6e86a"}, 60 | {file = "charset_normalizer-3.1.0-cp310-cp310-win32.whl", hash = "sha256:12d1a39aa6b8c6f6248bb54550efcc1c38ce0d8096a146638fd4738e42284448"}, 61 | {file = "charset_normalizer-3.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:65ed923f84a6844de5fd29726b888e58c62820e0769b76565480e1fdc3d062f8"}, 62 | {file = "charset_normalizer-3.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9a3267620866c9d17b959a84dd0bd2d45719b817245e49371ead79ed4f710d19"}, 63 | {file = "charset_normalizer-3.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6734e606355834f13445b6adc38b53c0fd45f1a56a9ba06c2058f86893ae8017"}, 64 | {file = "charset_normalizer-3.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f8303414c7b03f794347ad062c0516cee0e15f7a612abd0ce1e25caf6ceb47df"}, 65 | {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aaf53a6cebad0eae578f062c7d462155eada9c172bd8c4d250b8c1d8eb7f916a"}, 66 | {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3dc5b6a8ecfdc5748a7e429782598e4f17ef378e3e272eeb1340ea57c9109f41"}, 67 | {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e1b25e3ad6c909f398df8921780d6a3d120d8c09466720226fc621605b6f92b1"}, 68 | {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ca564606d2caafb0abe6d1b5311c2649e8071eb241b2d64e75a0d0065107e62"}, 69 | {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b82fab78e0b1329e183a65260581de4375f619167478dddab510c6c6fb04d9b6"}, 70 | {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:bd7163182133c0c7701b25e604cf1611c0d87712e56e88e7ee5d72deab3e76b5"}, 71 | {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:11d117e6c63e8f495412d37e7dc2e2fff09c34b2d09dbe2bee3c6229577818be"}, 72 | {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:cf6511efa4801b9b38dc5546d7547d5b5c6ef4b081c60b23e4d941d0eba9cbeb"}, 73 | {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:abc1185d79f47c0a7aaf7e2412a0eb2c03b724581139193d2d82b3ad8cbb00ac"}, 74 | {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:cb7b2ab0188829593b9de646545175547a70d9a6e2b63bf2cd87a0a391599324"}, 75 | {file = "charset_normalizer-3.1.0-cp311-cp311-win32.whl", hash = "sha256:c36bcbc0d5174a80d6cccf43a0ecaca44e81d25be4b7f90f0ed7bcfbb5a00909"}, 76 | {file = "charset_normalizer-3.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:cca4def576f47a09a943666b8f829606bcb17e2bc2d5911a46c8f8da45f56755"}, 77 | {file = "charset_normalizer-3.1.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:0c95f12b74681e9ae127728f7e5409cbbef9cd914d5896ef238cc779b8152373"}, 78 | {file = 
"charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fca62a8301b605b954ad2e9c3666f9d97f63872aa4efcae5492baca2056b74ab"}, 79 | {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ac0aa6cd53ab9a31d397f8303f92c42f534693528fafbdb997c82bae6e477ad9"}, 80 | {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c3af8e0f07399d3176b179f2e2634c3ce9c1301379a6b8c9c9aeecd481da494f"}, 81 | {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a5fc78f9e3f501a1614a98f7c54d3969f3ad9bba8ba3d9b438c3bc5d047dd28"}, 82 | {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:628c985afb2c7d27a4800bfb609e03985aaecb42f955049957814e0491d4006d"}, 83 | {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:74db0052d985cf37fa111828d0dd230776ac99c740e1a758ad99094be4f1803d"}, 84 | {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:1e8fcdd8f672a1c4fc8d0bd3a2b576b152d2a349782d1eb0f6b8e52e9954731d"}, 85 | {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:04afa6387e2b282cf78ff3dbce20f0cc071c12dc8f685bd40960cc68644cfea6"}, 86 | {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:dd5653e67b149503c68c4018bf07e42eeed6b4e956b24c00ccdf93ac79cdff84"}, 87 | {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d2686f91611f9e17f4548dbf050e75b079bbc2a82be565832bc8ea9047b61c8c"}, 88 | {file = "charset_normalizer-3.1.0-cp37-cp37m-win32.whl", hash = "sha256:4155b51ae05ed47199dc5b2a4e62abccb274cee6b01da5b895099b61b1982974"}, 89 | {file = "charset_normalizer-3.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:322102cdf1ab682ecc7d9b1c5eed4ec59657a65e1c146a0da342b78f4112db23"}, 90 | {file = "charset_normalizer-3.1.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:e633940f28c1e913615fd624fcdd72fdba807bf53ea6925d6a588e84e1151531"}, 91 | {file = "charset_normalizer-3.1.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:3a06f32c9634a8705f4ca9946d667609f52cf130d5548881401f1eb2c39b1e2c"}, 92 | {file = "charset_normalizer-3.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7381c66e0561c5757ffe616af869b916c8b4e42b367ab29fedc98481d1e74e14"}, 93 | {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3573d376454d956553c356df45bb824262c397c6e26ce43e8203c4c540ee0acb"}, 94 | {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e89df2958e5159b811af9ff0f92614dabf4ff617c03a4c1c6ff53bf1c399e0e1"}, 95 | {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:78cacd03e79d009d95635e7d6ff12c21eb89b894c354bd2b2ed0b4763373693b"}, 96 | {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de5695a6f1d8340b12a5d6d4484290ee74d61e467c39ff03b39e30df62cf83a0"}, 97 | {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1c60b9c202d00052183c9be85e5eaf18a4ada0a47d188a83c8f5c5b23252f649"}, 98 | {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = 
"sha256:f645caaf0008bacf349875a974220f1f1da349c5dbe7c4ec93048cdc785a3326"}, 99 | {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:ea9f9c6034ea2d93d9147818f17c2a0860d41b71c38b9ce4d55f21b6f9165a11"}, 100 | {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:80d1543d58bd3d6c271b66abf454d437a438dff01c3e62fdbcd68f2a11310d4b"}, 101 | {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:73dc03a6a7e30b7edc5b01b601e53e7fc924b04e1835e8e407c12c037e81adbd"}, 102 | {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:6f5c2e7bc8a4bf7c426599765b1bd33217ec84023033672c1e9a8b35eaeaaaf8"}, 103 | {file = "charset_normalizer-3.1.0-cp38-cp38-win32.whl", hash = "sha256:12a2b561af122e3d94cdb97fe6fb2bb2b82cef0cdca131646fdb940a1eda04f0"}, 104 | {file = "charset_normalizer-3.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:3160a0fd9754aab7d47f95a6b63ab355388d890163eb03b2d2b87ab0a30cfa59"}, 105 | {file = "charset_normalizer-3.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:38e812a197bf8e71a59fe55b757a84c1f946d0ac114acafaafaf21667a7e169e"}, 106 | {file = "charset_normalizer-3.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6baf0baf0d5d265fa7944feb9f7451cc316bfe30e8df1a61b1bb08577c554f31"}, 107 | {file = "charset_normalizer-3.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8f25e17ab3039b05f762b0a55ae0b3632b2e073d9c8fc88e89aca31a6198e88f"}, 108 | {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3747443b6a904001473370d7810aa19c3a180ccd52a7157aacc264a5ac79265e"}, 109 | {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b116502087ce8a6b7a5f1814568ccbd0e9f6cfd99948aa59b0e241dc57cf739f"}, 110 | {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d16fd5252f883eb074ca55cb622bc0bee49b979ae4e8639fff6ca3ff44f9f854"}, 111 | {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21fa558996782fc226b529fdd2ed7866c2c6ec91cee82735c98a197fae39f706"}, 112 | {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6f6c7a8a57e9405cad7485f4c9d3172ae486cfef1344b5ddd8e5239582d7355e"}, 113 | {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:ac3775e3311661d4adace3697a52ac0bab17edd166087d493b52d4f4f553f9f0"}, 114 | {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:10c93628d7497c81686e8e5e557aafa78f230cd9e77dd0c40032ef90c18f2230"}, 115 | {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:6f4f4668e1831850ebcc2fd0b1cd11721947b6dc7c00bf1c6bd3c929ae14f2c7"}, 116 | {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:0be65ccf618c1e7ac9b849c315cc2e8a8751d9cfdaa43027d4f6624bd587ab7e"}, 117 | {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:53d0a3fa5f8af98a1e261de6a3943ca631c526635eb5817a87a59d9a57ebf48f"}, 118 | {file = "charset_normalizer-3.1.0-cp39-cp39-win32.whl", hash = "sha256:a04f86f41a8916fe45ac5024ec477f41f886b3c435da2d4e3d2709b22ab02af1"}, 119 | {file = "charset_normalizer-3.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:830d2948a5ec37c386d3170c483063798d7879037492540f10a475e3fd6f244b"}, 120 | {file = 
"charset_normalizer-3.1.0-py3-none-any.whl", hash = "sha256:3d9098b479e78c85080c98e1e35ff40b4a31d8953102bb0fd7d1b6f8a2111a3d"}, 121 | ] 122 | 123 | [[package]] 124 | name = "colorama" 125 | version = "0.4.6" 126 | description = "Cross-platform colored terminal text." 127 | optional = false 128 | python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" 129 | files = [ 130 | {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, 131 | {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, 132 | ] 133 | 134 | [[package]] 135 | name = "docutils" 136 | version = "0.18.1" 137 | description = "Docutils -- Python Documentation Utilities" 138 | optional = false 139 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" 140 | files = [ 141 | {file = "docutils-0.18.1-py2.py3-none-any.whl", hash = "sha256:23010f129180089fbcd3bc08cfefccb3b890b0050e1ca00c867036e9d161b98c"}, 142 | {file = "docutils-0.18.1.tar.gz", hash = "sha256:679987caf361a7539d76e584cbeddc311e3aee937877c87346f31debc63e9d06"}, 143 | ] 144 | 145 | [[package]] 146 | name = "exceptiongroup" 147 | version = "1.1.1" 148 | description = "Backport of PEP 654 (exception groups)" 149 | optional = false 150 | python-versions = ">=3.7" 151 | files = [ 152 | {file = "exceptiongroup-1.1.1-py3-none-any.whl", hash = "sha256:232c37c63e4f682982c8b6459f33a8981039e5fb8756b2074364e5055c498c9e"}, 153 | {file = "exceptiongroup-1.1.1.tar.gz", hash = "sha256:d484c3090ba2889ae2928419117447a14daf3c1231d5e30d0aae34f354f01785"}, 154 | ] 155 | 156 | [package.extras] 157 | test = ["pytest (>=6)"] 158 | 159 | [[package]] 160 | name = "idna" 161 | version = "3.4" 162 | description = "Internationalized Domain Names in Applications (IDNA)" 163 | optional = false 164 | python-versions = ">=3.5" 165 | files = [ 166 | {file = "idna-3.4-py3-none-any.whl", hash = "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"}, 167 | {file = "idna-3.4.tar.gz", hash = "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4"}, 168 | ] 169 | 170 | [[package]] 171 | name = "imagesize" 172 | version = "1.4.1" 173 | description = "Getting image size from png/jpeg/jpeg2000/gif file" 174 | optional = false 175 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" 176 | files = [ 177 | {file = "imagesize-1.4.1-py2.py3-none-any.whl", hash = "sha256:0d8d18d08f840c19d0ee7ca1fd82490fdc3729b7ac93f49870406ddde8ef8d8b"}, 178 | {file = "imagesize-1.4.1.tar.gz", hash = "sha256:69150444affb9cb0d5cc5a92b3676f0b2fb7cd9ae39e947a5e11a36b4497cd4a"}, 179 | ] 180 | 181 | [[package]] 182 | name = "importlib-metadata" 183 | version = "6.6.0" 184 | description = "Read metadata from Python packages" 185 | optional = false 186 | python-versions = ">=3.7" 187 | files = [ 188 | {file = "importlib_metadata-6.6.0-py3-none-any.whl", hash = "sha256:43dd286a2cd8995d5eaef7fee2066340423b818ed3fd70adf0bad5f1fac53fed"}, 189 | {file = "importlib_metadata-6.6.0.tar.gz", hash = "sha256:92501cdf9cc66ebd3e612f1b4f0c0765dfa42f0fa38ffb319b6bd84dd675d705"}, 190 | ] 191 | 192 | [package.dependencies] 193 | typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""} 194 | zipp = ">=0.5" 195 | 196 | [package.extras] 197 | docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] 198 | perf = ["ipython"] 199 | testing = ["flake8 
(<5)", "flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf (>=0.9.2)"] 200 | 201 | [[package]] 202 | name = "iniconfig" 203 | version = "2.0.0" 204 | description = "brain-dead simple config-ini parsing" 205 | optional = false 206 | python-versions = ">=3.7" 207 | files = [ 208 | {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, 209 | {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, 210 | ] 211 | 212 | [[package]] 213 | name = "jinja2" 214 | version = "3.1.2" 215 | description = "A very fast and expressive template engine." 216 | optional = false 217 | python-versions = ">=3.7" 218 | files = [ 219 | {file = "Jinja2-3.1.2-py3-none-any.whl", hash = "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61"}, 220 | {file = "Jinja2-3.1.2.tar.gz", hash = "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852"}, 221 | ] 222 | 223 | [package.dependencies] 224 | MarkupSafe = ">=2.0" 225 | 226 | [package.extras] 227 | i18n = ["Babel (>=2.7)"] 228 | 229 | [[package]] 230 | name = "markupsafe" 231 | version = "2.1.2" 232 | description = "Safely add untrusted strings to HTML/XML markup." 233 | optional = false 234 | python-versions = ">=3.7" 235 | files = [ 236 | {file = "MarkupSafe-2.1.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:665a36ae6f8f20a4676b53224e33d456a6f5a72657d9c83c2aa00765072f31f7"}, 237 | {file = "MarkupSafe-2.1.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:340bea174e9761308703ae988e982005aedf427de816d1afe98147668cc03036"}, 238 | {file = "MarkupSafe-2.1.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22152d00bf4a9c7c83960521fc558f55a1adbc0631fbb00a9471e097b19d72e1"}, 239 | {file = "MarkupSafe-2.1.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:28057e985dace2f478e042eaa15606c7efccb700797660629da387eb289b9323"}, 240 | {file = "MarkupSafe-2.1.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ca244fa73f50a800cf8c3ebf7fd93149ec37f5cb9596aa8873ae2c1d23498601"}, 241 | {file = "MarkupSafe-2.1.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d9d971ec1e79906046aa3ca266de79eac42f1dbf3612a05dc9368125952bd1a1"}, 242 | {file = "MarkupSafe-2.1.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7e007132af78ea9df29495dbf7b5824cb71648d7133cf7848a2a5dd00d36f9ff"}, 243 | {file = "MarkupSafe-2.1.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7313ce6a199651c4ed9d7e4cfb4aa56fe923b1adf9af3b420ee14e6d9a73df65"}, 244 | {file = "MarkupSafe-2.1.2-cp310-cp310-win32.whl", hash = "sha256:c4a549890a45f57f1ebf99c067a4ad0cb423a05544accaf2b065246827ed9603"}, 245 | {file = "MarkupSafe-2.1.2-cp310-cp310-win_amd64.whl", hash = "sha256:835fb5e38fd89328e9c81067fd642b3593c33e1e17e2fdbf77f5676abb14a156"}, 246 | {file = "MarkupSafe-2.1.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:2ec4f2d48ae59bbb9d1f9d7efb9236ab81429a764dedca114f5fdabbc3788013"}, 247 | {file = "MarkupSafe-2.1.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:608e7073dfa9e38a85d38474c082d4281f4ce276ac0010224eaba11e929dd53a"}, 248 | {file = "MarkupSafe-2.1.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:65608c35bfb8a76763f37036547f7adfd09270fbdbf96608be2bead319728fcd"}, 249 | {file = "MarkupSafe-2.1.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2bfb563d0211ce16b63c7cb9395d2c682a23187f54c3d79bfec33e6705473c6"}, 250 | {file = "MarkupSafe-2.1.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:da25303d91526aac3672ee6d49a2f3db2d9502a4a60b55519feb1a4c7714e07d"}, 251 | {file = "MarkupSafe-2.1.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:9cad97ab29dfc3f0249b483412c85c8ef4766d96cdf9dcf5a1e3caa3f3661cf1"}, 252 | {file = "MarkupSafe-2.1.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:085fd3201e7b12809f9e6e9bc1e5c96a368c8523fad5afb02afe3c051ae4afcc"}, 253 | {file = "MarkupSafe-2.1.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:1bea30e9bf331f3fef67e0a3877b2288593c98a21ccb2cf29b74c581a4eb3af0"}, 254 | {file = "MarkupSafe-2.1.2-cp311-cp311-win32.whl", hash = "sha256:7df70907e00c970c60b9ef2938d894a9381f38e6b9db73c5be35e59d92e06625"}, 255 | {file = "MarkupSafe-2.1.2-cp311-cp311-win_amd64.whl", hash = "sha256:e55e40ff0cc8cc5c07996915ad367fa47da6b3fc091fdadca7f5403239c5fec3"}, 256 | {file = "MarkupSafe-2.1.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:a6e40afa7f45939ca356f348c8e23048e02cb109ced1eb8420961b2f40fb373a"}, 257 | {file = "MarkupSafe-2.1.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cf877ab4ed6e302ec1d04952ca358b381a882fbd9d1b07cccbfd61783561f98a"}, 258 | {file = "MarkupSafe-2.1.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:63ba06c9941e46fa389d389644e2d8225e0e3e5ebcc4ff1ea8506dce646f8c8a"}, 259 | {file = "MarkupSafe-2.1.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f1cd098434e83e656abf198f103a8207a8187c0fc110306691a2e94a78d0abb2"}, 260 | {file = "MarkupSafe-2.1.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:55f44b440d491028addb3b88f72207d71eeebfb7b5dbf0643f7c023ae1fba619"}, 261 | {file = "MarkupSafe-2.1.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:a6f2fcca746e8d5910e18782f976489939d54a91f9411c32051b4aab2bd7c513"}, 262 | {file = "MarkupSafe-2.1.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:0b462104ba25f1ac006fdab8b6a01ebbfbce9ed37fd37fd4acd70c67c973e460"}, 263 | {file = "MarkupSafe-2.1.2-cp37-cp37m-win32.whl", hash = "sha256:7668b52e102d0ed87cb082380a7e2e1e78737ddecdde129acadb0eccc5423859"}, 264 | {file = "MarkupSafe-2.1.2-cp37-cp37m-win_amd64.whl", hash = "sha256:6d6607f98fcf17e534162f0709aaad3ab7a96032723d8ac8750ffe17ae5a0666"}, 265 | {file = "MarkupSafe-2.1.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:a806db027852538d2ad7555b203300173dd1b77ba116de92da9afbc3a3be3eed"}, 266 | {file = "MarkupSafe-2.1.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:a4abaec6ca3ad8660690236d11bfe28dfd707778e2442b45addd2f086d6ef094"}, 267 | {file = "MarkupSafe-2.1.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f03a532d7dee1bed20bc4884194a16160a2de9ffc6354b3878ec9682bb623c54"}, 268 | {file = "MarkupSafe-2.1.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4cf06cdc1dda95223e9d2d3c58d3b178aa5dacb35ee7e3bbac10e4e1faacb419"}, 269 | {file = "MarkupSafe-2.1.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:22731d79ed2eb25059ae3df1dfc9cb1546691cc41f4e3130fe6bfbc3ecbbecfa"}, 270 | {file = 
"MarkupSafe-2.1.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:f8ffb705ffcf5ddd0e80b65ddf7bed7ee4f5a441ea7d3419e861a12eaf41af58"}, 271 | {file = "MarkupSafe-2.1.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:8db032bf0ce9022a8e41a22598eefc802314e81b879ae093f36ce9ddf39ab1ba"}, 272 | {file = "MarkupSafe-2.1.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:2298c859cfc5463f1b64bd55cb3e602528db6fa0f3cfd568d3605c50678f8f03"}, 273 | {file = "MarkupSafe-2.1.2-cp38-cp38-win32.whl", hash = "sha256:50c42830a633fa0cf9e7d27664637532791bfc31c731a87b202d2d8ac40c3ea2"}, 274 | {file = "MarkupSafe-2.1.2-cp38-cp38-win_amd64.whl", hash = "sha256:bb06feb762bade6bf3c8b844462274db0c76acc95c52abe8dbed28ae3d44a147"}, 275 | {file = "MarkupSafe-2.1.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:99625a92da8229df6d44335e6fcc558a5037dd0a760e11d84be2260e6f37002f"}, 276 | {file = "MarkupSafe-2.1.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:8bca7e26c1dd751236cfb0c6c72d4ad61d986e9a41bbf76cb445f69488b2a2bd"}, 277 | {file = "MarkupSafe-2.1.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40627dcf047dadb22cd25ea7ecfe9cbf3bbbad0482ee5920b582f3809c97654f"}, 278 | {file = "MarkupSafe-2.1.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:40dfd3fefbef579ee058f139733ac336312663c6706d1163b82b3003fb1925c4"}, 279 | {file = "MarkupSafe-2.1.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:090376d812fb6ac5f171e5938e82e7f2d7adc2b629101cec0db8b267815c85e2"}, 280 | {file = "MarkupSafe-2.1.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:2e7821bffe00aa6bd07a23913b7f4e01328c3d5cc0b40b36c0bd81d362faeb65"}, 281 | {file = "MarkupSafe-2.1.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:c0a33bc9f02c2b17c3ea382f91b4db0e6cde90b63b296422a939886a7a80de1c"}, 282 | {file = "MarkupSafe-2.1.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:b8526c6d437855442cdd3d87eede9c425c4445ea011ca38d937db299382e6fa3"}, 283 | {file = "MarkupSafe-2.1.2-cp39-cp39-win32.whl", hash = "sha256:137678c63c977754abe9086a3ec011e8fd985ab90631145dfb9294ad09c102a7"}, 284 | {file = "MarkupSafe-2.1.2-cp39-cp39-win_amd64.whl", hash = "sha256:0576fe974b40a400449768941d5d0858cc624e3249dfd1e0c33674e5c7ca7aed"}, 285 | {file = "MarkupSafe-2.1.2.tar.gz", hash = "sha256:abcabc8c2b26036d62d4c746381a6f7cf60aafcc653198ad678306986b09450d"}, 286 | ] 287 | 288 | [[package]] 289 | name = "packaging" 290 | version = "23.1" 291 | description = "Core utilities for Python packages" 292 | optional = false 293 | python-versions = ">=3.7" 294 | files = [ 295 | {file = "packaging-23.1-py3-none-any.whl", hash = "sha256:994793af429502c4ea2ebf6bf664629d07c1a9fe974af92966e4b8d2df7edc61"}, 296 | {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"}, 297 | ] 298 | 299 | [[package]] 300 | name = "pluggy" 301 | version = "1.0.0" 302 | description = "plugin and hook calling mechanisms for python" 303 | optional = false 304 | python-versions = ">=3.6" 305 | files = [ 306 | {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, 307 | {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, 308 | ] 309 | 310 | [package.dependencies] 311 | importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} 312 | 313 | [package.extras] 314 | dev = ["pre-commit", 
"tox"] 315 | testing = ["pytest", "pytest-benchmark"] 316 | 317 | [[package]] 318 | name = "pygments" 319 | version = "2.15.1" 320 | description = "Pygments is a syntax highlighting package written in Python." 321 | optional = false 322 | python-versions = ">=3.7" 323 | files = [ 324 | {file = "Pygments-2.15.1-py3-none-any.whl", hash = "sha256:db2db3deb4b4179f399a09054b023b6a586b76499d36965813c71aa8ed7b5fd1"}, 325 | {file = "Pygments-2.15.1.tar.gz", hash = "sha256:8ace4d3c1dd481894b2005f560ead0f9f19ee64fe983366be1a21e171d12775c"}, 326 | ] 327 | 328 | [package.extras] 329 | plugins = ["importlib-metadata"] 330 | 331 | [[package]] 332 | name = "pytest" 333 | version = "7.3.1" 334 | description = "pytest: simple powerful testing with Python" 335 | optional = false 336 | python-versions = ">=3.7" 337 | files = [ 338 | {file = "pytest-7.3.1-py3-none-any.whl", hash = "sha256:3799fa815351fea3a5e96ac7e503a96fa51cc9942c3753cda7651b93c1cfa362"}, 339 | {file = "pytest-7.3.1.tar.gz", hash = "sha256:434afafd78b1d78ed0addf160ad2b77a30d35d4bdf8af234fe621919d9ed15e3"}, 340 | ] 341 | 342 | [package.dependencies] 343 | colorama = {version = "*", markers = "sys_platform == \"win32\""} 344 | exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} 345 | importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} 346 | iniconfig = "*" 347 | packaging = "*" 348 | pluggy = ">=0.12,<2.0" 349 | tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} 350 | 351 | [package.extras] 352 | testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] 353 | 354 | [[package]] 355 | name = "pytz" 356 | version = "2023.3" 357 | description = "World timezone definitions, modern and historical" 358 | optional = false 359 | python-versions = "*" 360 | files = [ 361 | {file = "pytz-2023.3-py2.py3-none-any.whl", hash = "sha256:a151b3abb88eda1d4e34a9814df37de2a80e301e68ba0fd856fb9b46bfbbbffb"}, 362 | {file = "pytz-2023.3.tar.gz", hash = "sha256:1d8ce29db189191fb55338ee6d0387d82ab59f3d00eac103412d64e0ebd0c588"}, 363 | ] 364 | 365 | [[package]] 366 | name = "pyyaml" 367 | version = "6.0" 368 | description = "YAML parser and emitter for Python" 369 | optional = false 370 | python-versions = ">=3.6" 371 | files = [ 372 | {file = "PyYAML-6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53"}, 373 | {file = "PyYAML-6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c"}, 374 | {file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77f396e6ef4c73fdc33a9157446466f1cff553d979bd00ecb64385760c6babdc"}, 375 | {file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a80a78046a72361de73f8f395f1f1e49f956c6be882eed58505a15f3e430962b"}, 376 | {file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"}, 377 | {file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"}, 378 | {file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"}, 379 | {file = "PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358"}, 380 | {file = "PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1"}, 381 | {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d"}, 382 | {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f"}, 383 | {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782"}, 384 | {file = "PyYAML-6.0-cp311-cp311-win32.whl", hash = "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7"}, 385 | {file = "PyYAML-6.0-cp311-cp311-win_amd64.whl", hash = "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf"}, 386 | {file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"}, 387 | {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"}, 388 | {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"}, 389 | {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:98c4d36e99714e55cfbaaee6dd5badbc9a1ec339ebfc3b1f52e293aee6bb71a4"}, 390 | {file = "PyYAML-6.0-cp36-cp36m-win32.whl", hash = "sha256:0283c35a6a9fbf047493e3a0ce8d79ef5030852c51e9d911a27badfde0605293"}, 391 | {file = "PyYAML-6.0-cp36-cp36m-win_amd64.whl", hash = "sha256:07751360502caac1c067a8132d150cf3d61339af5691fe9e87803040dbc5db57"}, 392 | {file = "PyYAML-6.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:819b3830a1543db06c4d4b865e70ded25be52a2e0631ccd2f6a47a2822f2fd7c"}, 393 | {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:473f9edb243cb1935ab5a084eb238d842fb8f404ed2193a915d1784b5a6b5fc0"}, 394 | {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0ce82d761c532fe4ec3f87fc45688bdd3a4c1dc5e0b4a19814b9009a29baefd4"}, 395 | {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:231710d57adfd809ef5d34183b8ed1eeae3f76459c18fb4a0b373ad56bedcdd9"}, 396 | {file = "PyYAML-6.0-cp37-cp37m-win32.whl", hash = "sha256:c5687b8d43cf58545ade1fe3e055f70eac7a5a1a0bf42824308d868289a95737"}, 397 | {file = "PyYAML-6.0-cp37-cp37m-win_amd64.whl", hash = "sha256:d15a181d1ecd0d4270dc32edb46f7cb7733c7c508857278d3d378d14d606db2d"}, 398 | {file = "PyYAML-6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0b4624f379dab24d3725ffde76559cff63d9ec94e1736b556dacdfebe5ab6d4b"}, 399 | {file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:213c60cd50106436cc818accf5baa1aba61c0189ff610f64f4a3e8c6726218ba"}, 400 | {file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9fa600030013c4de8165339db93d182b9431076eb98eb40ee068700c9c813e34"}, 401 | {file = "PyYAML-6.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = 
"sha256:277a0ef2981ca40581a47093e9e2d13b3f1fbbeffae064c1d21bfceba2030287"}, 402 | {file = "PyYAML-6.0-cp38-cp38-win32.whl", hash = "sha256:d4eccecf9adf6fbcc6861a38015c2a64f38b9d94838ac1810a9023a0609e1b78"}, 403 | {file = "PyYAML-6.0-cp38-cp38-win_amd64.whl", hash = "sha256:1e4747bc279b4f613a09eb64bba2ba602d8a6664c6ce6396a4d0cd413a50ce07"}, 404 | {file = "PyYAML-6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:055d937d65826939cb044fc8c9b08889e8c743fdc6a32b33e2390f66013e449b"}, 405 | {file = "PyYAML-6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e61ceaab6f49fb8bdfaa0f92c4b57bcfbea54c09277b1b4f7ac376bfb7a7c174"}, 406 | {file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d67d839ede4ed1b28a4e8909735fc992a923cdb84e618544973d7dfc71540803"}, 407 | {file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cba8c411ef271aa037d7357a2bc8f9ee8b58b9965831d9e51baf703280dc73d3"}, 408 | {file = "PyYAML-6.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:40527857252b61eacd1d9af500c3337ba8deb8fc298940291486c465c8b46ec0"}, 409 | {file = "PyYAML-6.0-cp39-cp39-win32.whl", hash = "sha256:b5b9eccad747aabaaffbc6064800670f0c297e52c12754eb1d976c57e4f74dcb"}, 410 | {file = "PyYAML-6.0-cp39-cp39-win_amd64.whl", hash = "sha256:b3d267842bf12586ba6c734f89d1f5b871df0273157918b0ccefa29deb05c21c"}, 411 | {file = "PyYAML-6.0.tar.gz", hash = "sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2"}, 412 | ] 413 | 414 | [[package]] 415 | name = "requests" 416 | version = "2.31.0" 417 | description = "Python HTTP for Humans." 418 | optional = false 419 | python-versions = ">=3.7" 420 | files = [ 421 | {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"}, 422 | {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"}, 423 | ] 424 | 425 | [package.dependencies] 426 | certifi = ">=2017.4.17" 427 | charset-normalizer = ">=2,<4" 428 | idna = ">=2.5,<4" 429 | urllib3 = ">=1.21.1,<3" 430 | 431 | [package.extras] 432 | socks = ["PySocks (>=1.5.6,!=1.5.7)"] 433 | use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] 434 | 435 | [[package]] 436 | name = "snowballstemmer" 437 | version = "2.2.0" 438 | description = "This package provides 29 stemmers for 28 languages generated from Snowball algorithms." 
439 | optional = false 440 | python-versions = "*" 441 | files = [ 442 | {file = "snowballstemmer-2.2.0-py2.py3-none-any.whl", hash = "sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a"}, 443 | {file = "snowballstemmer-2.2.0.tar.gz", hash = "sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1"}, 444 | ] 445 | 446 | [[package]] 447 | name = "sphinx" 448 | version = "6.2.1" 449 | description = "Python documentation generator" 450 | optional = false 451 | python-versions = ">=3.8" 452 | files = [ 453 | {file = "Sphinx-6.2.1.tar.gz", hash = "sha256:6d56a34697bb749ffa0152feafc4b19836c755d90a7c59b72bc7dfd371b9cc6b"}, 454 | {file = "sphinx-6.2.1-py3-none-any.whl", hash = "sha256:97787ff1fa3256a3eef9eda523a63dbf299f7b47e053cfcf684a1c2a8380c912"}, 455 | ] 456 | 457 | [package.dependencies] 458 | alabaster = ">=0.7,<0.8" 459 | babel = ">=2.9" 460 | colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""} 461 | docutils = ">=0.18.1,<0.20" 462 | imagesize = ">=1.3" 463 | importlib-metadata = {version = ">=4.8", markers = "python_version < \"3.10\""} 464 | Jinja2 = ">=3.0" 465 | packaging = ">=21.0" 466 | Pygments = ">=2.13" 467 | requests = ">=2.25.0" 468 | snowballstemmer = ">=2.0" 469 | sphinxcontrib-applehelp = "*" 470 | sphinxcontrib-devhelp = "*" 471 | sphinxcontrib-htmlhelp = ">=2.0.0" 472 | sphinxcontrib-jsmath = "*" 473 | sphinxcontrib-qthelp = "*" 474 | sphinxcontrib-serializinghtml = ">=1.1.5" 475 | 476 | [package.extras] 477 | docs = ["sphinxcontrib-websupport"] 478 | lint = ["docutils-stubs", "flake8 (>=3.5.0)", "flake8-simplify", "isort", "mypy (>=0.990)", "ruff", "sphinx-lint", "types-requests"] 479 | test = ["cython", "filelock", "html5lib", "pytest (>=4.6)"] 480 | 481 | [[package]] 482 | name = "sphinx-rtd-theme" 483 | version = "1.2.1" 484 | description = "Read the Docs theme for Sphinx" 485 | optional = false 486 | python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" 487 | files = [ 488 | {file = "sphinx_rtd_theme-1.2.1-py2.py3-none-any.whl", hash = "sha256:2cc9351176cbf91944ce44cefd4fab6c3b76ac53aa9e15d6db45a3229ad7f866"}, 489 | {file = "sphinx_rtd_theme-1.2.1.tar.gz", hash = "sha256:cf9a7dc0352cf179c538891cb28d6fad6391117d4e21c891776ab41dd6c8ff70"}, 490 | ] 491 | 492 | [package.dependencies] 493 | docutils = "<0.19" 494 | sphinx = ">=1.6,<7" 495 | sphinxcontrib-jquery = {version = ">=2.0.0,<3.0.0 || >3.0.0", markers = "python_version > \"3\""} 496 | 497 | [package.extras] 498 | dev = ["bump2version", "sphinxcontrib-httpdomain", "transifex-client", "wheel"] 499 | 500 | [[package]] 501 | name = "sphinxcontrib-applehelp" 502 | version = "1.0.4" 503 | description = "sphinxcontrib-applehelp is a Sphinx extension which outputs Apple help books" 504 | optional = false 505 | python-versions = ">=3.8" 506 | files = [ 507 | {file = "sphinxcontrib-applehelp-1.0.4.tar.gz", hash = "sha256:828f867945bbe39817c210a1abfd1bc4895c8b73fcaade56d45357a348a07d7e"}, 508 | {file = "sphinxcontrib_applehelp-1.0.4-py3-none-any.whl", hash = "sha256:29d341f67fb0f6f586b23ad80e072c8e6ad0b48417db2bde114a4c9746feb228"}, 509 | ] 510 | 511 | [package.extras] 512 | lint = ["docutils-stubs", "flake8", "mypy"] 513 | test = ["pytest"] 514 | 515 | [[package]] 516 | name = "sphinxcontrib-devhelp" 517 | version = "1.0.2" 518 | description = "sphinxcontrib-devhelp is a sphinx extension which outputs Devhelp document." 
519 | optional = false 520 | python-versions = ">=3.5" 521 | files = [ 522 | {file = "sphinxcontrib-devhelp-1.0.2.tar.gz", hash = "sha256:ff7f1afa7b9642e7060379360a67e9c41e8f3121f2ce9164266f61b9f4b338e4"}, 523 | {file = "sphinxcontrib_devhelp-1.0.2-py2.py3-none-any.whl", hash = "sha256:8165223f9a335cc1af7ffe1ed31d2871f325254c0423bc0c4c7cd1c1e4734a2e"}, 524 | ] 525 | 526 | [package.extras] 527 | lint = ["docutils-stubs", "flake8", "mypy"] 528 | test = ["pytest"] 529 | 530 | [[package]] 531 | name = "sphinxcontrib-htmlhelp" 532 | version = "2.0.1" 533 | description = "sphinxcontrib-htmlhelp is a sphinx extension which renders HTML help files" 534 | optional = false 535 | python-versions = ">=3.8" 536 | files = [ 537 | {file = "sphinxcontrib-htmlhelp-2.0.1.tar.gz", hash = "sha256:0cbdd302815330058422b98a113195c9249825d681e18f11e8b1f78a2f11efff"}, 538 | {file = "sphinxcontrib_htmlhelp-2.0.1-py3-none-any.whl", hash = "sha256:c38cb46dccf316c79de6e5515e1770414b797162b23cd3d06e67020e1d2a6903"}, 539 | ] 540 | 541 | [package.extras] 542 | lint = ["docutils-stubs", "flake8", "mypy"] 543 | test = ["html5lib", "pytest"] 544 | 545 | [[package]] 546 | name = "sphinxcontrib-jquery" 547 | version = "4.1" 548 | description = "Extension to include jQuery on newer Sphinx releases" 549 | optional = false 550 | python-versions = ">=2.7" 551 | files = [ 552 | {file = "sphinxcontrib-jquery-4.1.tar.gz", hash = "sha256:1620739f04e36a2c779f1a131a2dfd49b2fd07351bf1968ced074365933abc7a"}, 553 | {file = "sphinxcontrib_jquery-4.1-py2.py3-none-any.whl", hash = "sha256:f936030d7d0147dd026a4f2b5a57343d233f1fc7b363f68b3d4f1cb0993878ae"}, 554 | ] 555 | 556 | [package.dependencies] 557 | Sphinx = ">=1.8" 558 | 559 | [[package]] 560 | name = "sphinxcontrib-jsmath" 561 | version = "1.0.1" 562 | description = "A sphinx extension which renders display math in HTML via JavaScript" 563 | optional = false 564 | python-versions = ">=3.5" 565 | files = [ 566 | {file = "sphinxcontrib-jsmath-1.0.1.tar.gz", hash = "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8"}, 567 | {file = "sphinxcontrib_jsmath-1.0.1-py2.py3-none-any.whl", hash = "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178"}, 568 | ] 569 | 570 | [package.extras] 571 | test = ["flake8", "mypy", "pytest"] 572 | 573 | [[package]] 574 | name = "sphinxcontrib-qthelp" 575 | version = "1.0.3" 576 | description = "sphinxcontrib-qthelp is a sphinx extension which outputs QtHelp document." 577 | optional = false 578 | python-versions = ">=3.5" 579 | files = [ 580 | {file = "sphinxcontrib-qthelp-1.0.3.tar.gz", hash = "sha256:4c33767ee058b70dba89a6fc5c1892c0d57a54be67ddd3e7875a18d14cba5a72"}, 581 | {file = "sphinxcontrib_qthelp-1.0.3-py2.py3-none-any.whl", hash = "sha256:bd9fc24bcb748a8d51fd4ecaade681350aa63009a347a8c14e637895444dfab6"}, 582 | ] 583 | 584 | [package.extras] 585 | lint = ["docutils-stubs", "flake8", "mypy"] 586 | test = ["pytest"] 587 | 588 | [[package]] 589 | name = "sphinxcontrib-serializinghtml" 590 | version = "1.1.5" 591 | description = "sphinxcontrib-serializinghtml is a sphinx extension which outputs \"serialized\" HTML files (json and pickle)." 
592 | optional = false 593 | python-versions = ">=3.5" 594 | files = [ 595 | {file = "sphinxcontrib-serializinghtml-1.1.5.tar.gz", hash = "sha256:aa5f6de5dfdf809ef505c4895e51ef5c9eac17d0f287933eb49ec495280b6952"}, 596 | {file = "sphinxcontrib_serializinghtml-1.1.5-py2.py3-none-any.whl", hash = "sha256:352a9a00ae864471d3a7ead8d7d79f5fc0b57e8b3f95e9867eb9eb28999b92fd"}, 597 | ] 598 | 599 | [package.extras] 600 | lint = ["docutils-stubs", "flake8", "mypy"] 601 | test = ["pytest"] 602 | 603 | [[package]] 604 | name = "tomli" 605 | version = "2.0.1" 606 | description = "A lil' TOML parser" 607 | optional = false 608 | python-versions = ">=3.7" 609 | files = [ 610 | {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, 611 | {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, 612 | ] 613 | 614 | [[package]] 615 | name = "typing-extensions" 616 | version = "4.6.1" 617 | description = "Backported and Experimental Type Hints for Python 3.7+" 618 | optional = false 619 | python-versions = ">=3.7" 620 | files = [ 621 | {file = "typing_extensions-4.6.1-py3-none-any.whl", hash = "sha256:6bac751f4789b135c43228e72de18637e9a6c29d12777023a703fd1a6858469f"}, 622 | {file = "typing_extensions-4.6.1.tar.gz", hash = "sha256:558bc0c4145f01e6405f4a5fdbd82050bd221b119f4bf72a961a1cfd471349d6"}, 623 | ] 624 | 625 | [[package]] 626 | name = "urllib3" 627 | version = "2.0.2" 628 | description = "HTTP library with thread-safe connection pooling, file post, and more." 629 | optional = false 630 | python-versions = ">=3.7" 631 | files = [ 632 | {file = "urllib3-2.0.2-py3-none-any.whl", hash = "sha256:d055c2f9d38dc53c808f6fdc8eab7360b6fdbbde02340ed25cfbcd817c62469e"}, 633 | {file = "urllib3-2.0.2.tar.gz", hash = "sha256:61717a1095d7e155cdb737ac7bb2f4324a858a1e2e6466f6d03ff630ca68d3cc"}, 634 | ] 635 | 636 | [package.extras] 637 | brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] 638 | secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17.1.0)", "urllib3-secure-extra"] 639 | socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] 640 | zstd = ["zstandard (>=0.18.0)"] 641 | 642 | [[package]] 643 | name = "zipp" 644 | version = "3.15.0" 645 | description = "Backport of pathlib-compatible object wrapper for zip files" 646 | optional = false 647 | python-versions = ">=3.7" 648 | files = [ 649 | {file = "zipp-3.15.0-py3-none-any.whl", hash = "sha256:48904fc76a60e542af151aded95726c1a5c34ed43ab4134b597665c86d7ad556"}, 650 | {file = "zipp-3.15.0.tar.gz", hash = "sha256:112929ad649da941c23de50f356a2b5570c954b65150642bccdd66bf194d224b"}, 651 | ] 652 | 653 | [package.extras] 654 | docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] 655 | testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] 656 | 657 | [metadata] 658 | lock-version = "2.0" 659 | python-versions = "^3.7" 660 | content-hash = "82060e113dafb63ffa0ca0a9ea5b6070c9771ceb1dd3727757730f14aaaab96b" 661 | -------------------------------------------------------------------------------- /tests/cases/extract.yml: -------------------------------------------------------------------------------- 1 | tests: 2 | mentions: 3 | - description: "Extract mention at the begining of a 
tweet" 4 | text: "@username reply" 5 | expected: ["username"] 6 | 7 | - description: "Extract mention at the end of a tweet" 8 | text: "mention @username" 9 | expected: ["username"] 10 | 11 | - description: "Extract mention in the middle of a tweet" 12 | text: "mention @username in the middle" 13 | expected: ["username"] 14 | 15 | - description: "Extract mention of username with underscore" 16 | text: "mention @user_name" 17 | expected: ["user_name"] 18 | 19 | - description: "Extract mention of all numeric username" 20 | text: "mention @12345" 21 | expected: ["12345"] 22 | 23 | - description: "Extract mention or multiple usernames" 24 | text: "mention @username1 @username2" 25 | expected: ["username1", "username2"] 26 | 27 | - description: "Extract mention in the middle of a Japanese tweet" 28 | text: "の@usernameに到着を待っている" 29 | expected: ["username"] 30 | 31 | - description: "DO NOT extract username ending in @" 32 | text: "Current Status: @_@ (cc: @username)" 33 | expected: ["username"] 34 | 35 | - description: "DO NOT extract username followed by accented latin characters" 36 | text: "@aliceìnheiro something something" 37 | expected: [] 38 | 39 | - description: "Extract lone metion but not @user@user (too close to an email)" 40 | text: "@username email me @test@example.com" 41 | expected: ["username"] 42 | 43 | - description: "DO NOT extract 'http' in '@http://' as username" 44 | text: "@http://twitter.com" 45 | expected: [] 46 | 47 | - description: "Extract mentions before newline" 48 | text: "@username\n@mention" 49 | expected: ["username", "mention"] 50 | 51 | - description: "Extract mentions after 'RT'" 52 | text: "RT@username RT:@mention RT @test" 53 | expected: ["username", "mention", "test"] 54 | 55 | - description: "Extract mentions after 'rt'" 56 | text: "rt@username rt:@mention rt @test" 57 | expected: ["username", "mention", "test"] 58 | 59 | - description: "Extract mentions after 'Rt'" 60 | text: "Rt@username Rt:@mention Rt @test" 61 | expected: ["username", "mention", "test"] 62 | 63 | - description: "Extract mentions after 'rT'" 64 | text: "rT@username rT:@mention rT @test" 65 | expected: ["username", "mention", "test"] 66 | 67 | - description: "DO NOT extract username preceded by !" 68 | text: "f!@kn" 69 | expected: [] 70 | 71 | - description: "DO NOT extract username preceded by @" 72 | text: "f@@kn" 73 | expected: [] 74 | 75 | - description: "DO NOT extract username preceded by #" 76 | text: "f#@kn" 77 | expected: [] 78 | 79 | - description: "DO NOT extract username preceded by $" 80 | text: "f$@kn" 81 | expected: [] 82 | 83 | - description: "DO NOT extract username preceded by %" 84 | text: "f%@kn" 85 | expected: [] 86 | 87 | - description: "DO NOT extract username preceded by &" 88 | text: "f&@kn" 89 | expected: [] 90 | 91 | - description: "DO NOT extract username preceded by *" 92 | text: "f*@kn" 93 | expected: [] 94 | 95 | mentions_with_indices: 96 | - description: "Extract a mention at the start" 97 | text: "@username yo!" 
98 | expected: 99 | - screen_name: "username" 100 | indices: [0, 9] 101 | 102 | - description: "Extract a mention that has the same thing mentioned at the start" 103 | text: "username @username" 104 | expected: 105 | - screen_name: "username" 106 | indices: [9, 18] 107 | 108 | - description: "Extract a mention in the middle of a Japanese tweet" 109 | text: "の@usernameに到着を待っている" 110 | expected: 111 | - screen_name: "username" 112 | indices: [1, 10] 113 | 114 | mentions_or_lists_with_indices: 115 | - description: "Extract a mention" 116 | text: "@username yo!" 117 | expected: 118 | - screen_name: "username" 119 | list_slug: "" 120 | indices: [0, 9] 121 | 122 | - description: "Extract a list" 123 | text: "@username/list-name is a great list!" 124 | expected: 125 | - screen_name: "username" 126 | list_slug: "/list-name" 127 | indices: [0, 19] 128 | 129 | - description: "Extract a mention and list" 130 | text: "Hey @username, check out out @otheruser/list_name-01!" 131 | expected: 132 | - screen_name: "username" 133 | list_slug: "" 134 | indices: [4, 13] 135 | - screen_name: "otheruser" 136 | list_slug: "/list_name-01" 137 | indices: [29, 52] 138 | 139 | - description: "Extract a list in the middle of a Japanese tweet" 140 | text: "の@username/list_name-01に到着を待っている" 141 | expected: 142 | - screen_name: "username" 143 | list_slug: "/list_name-01" 144 | indices: [1, 23] 145 | 146 | - description: "DO NOT extract a list with slug that starts with a number" 147 | text: "@username/7list-name is a great list!" 148 | expected: 149 | - screen_name: "username" 150 | list_slug: "" 151 | indices: [0, 9] 152 | 153 | replies: 154 | - description: "Extract reply at the begining of a tweet" 155 | text: "@username reply" 156 | expected: "username" 157 | 158 | - description: "Extract reply preceded by only a space" 159 | text: " @username reply" 160 | expected: "username" 161 | 162 | - description: "Extract reply preceded by only a full-width space (U+3000)" 163 | text: " @username reply" 164 | expected: "username" 165 | 166 | - description: "DO NOT Extract reply when preceded by text" 167 | text: "a @username mention, not a reply" 168 | expected: 169 | 170 | - description: "DO NOT Extract reply when preceded by ." 171 | text: ".@username mention, not a reply" 172 | expected: 173 | 174 | - description: "DO NOT Extract reply when preceded by /" 175 | text: "/@username mention, not a reply" 176 | expected: 177 | 178 | - description: "DO NOT Extract reply when preceded by _" 179 | text: "_@username mention, not a reply" 180 | expected: 181 | 182 | - description: "DO NOT Extract reply when preceded by -" 183 | text: "-@username mention, not a reply" 184 | expected: 185 | 186 | - description: "DO NOT Extract reply when preceded by +" 187 | text: "+@username mention, not a reply" 188 | expected: 189 | 190 | - description: "DO NOT Extract reply when preceded by #" 191 | text: "#@username mention, not a reply" 192 | expected: 193 | 194 | - description: "DO NOT Extract reply when preceded by !" 
195 | text: "!@username mention, not a reply" 196 | expected: 197 | 198 | - description: "DO NOT Extract reply when preceded by @" 199 | text: "@@username mention, not a reply" 200 | expected: 201 | 202 | - description: "DO NOT Extract reply when followed by URL" 203 | text: "@http://twitter.com" 204 | expected: 205 | 206 | urls: 207 | - description: "Extract a lone URL" 208 | text: "http://example.com" 209 | expected: ["http://example.com"] 210 | 211 | - description: "Extract a lone unicode url" 212 | text: "http://ああ.com" 213 | expected: ["http://ああ.com"] 214 | 215 | - description: "Extract a lone unicode url with -" 216 | text: "http://あ-あ.com" 217 | expected: ["http://あ-あ.com"] 218 | 219 | - description: "Extract valid URL: http://google.com" 220 | text: "text http://google.com" 221 | expected: ["http://google.com"] 222 | 223 | - description: "Extract valid URL: http://foobar.com/#" 224 | text: "text http://foobar.com/#" 225 | expected: ["http://foobar.com/#"] 226 | 227 | - description: "Extract valid URL: http://google.com/#foo" 228 | text: "text http://google.com/#foo" 229 | expected: ["http://google.com/#foo"] 230 | 231 | - description: "Extract valid URL: http://google.com/#search?q=iphone%20-filter%3Alinks" 232 | text: "text http://google.com/#search?q=iphone%20-filter%3Alinks" 233 | expected: ["http://google.com/#search?q=iphone%20-filter%3Alinks"] 234 | 235 | - description: "Extract valid URL: http://twitter.com/#search?q=iphone%20-filter%3Alinks" 236 | text: "text http://twitter.com/#search?q=iphone%20-filter%3Alinks" 237 | expected: ["http://twitter.com/#search?q=iphone%20-filter%3Alinks"] 238 | 239 | - description: "Extract valid URL: http://somedomain.com/index.php?path=/abc/def/" 240 | text: "text http://somedomain.com/index.php?path=/abc/def/" 241 | expected: ["http://somedomain.com/index.php?path=/abc/def/"] 242 | 243 | - description: "Extract valid URL: http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html" 244 | text: "text http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html" 245 | expected: ["http://www.boingboing.net/2007/02/14/katamari_damacy_phon.html"] 246 | 247 | - description: "Extract valid URL: http://somehost.com:3000" 248 | text: "text http://somehost.com:3000" 249 | expected: ["http://somehost.com:3000"] 250 | 251 | - description: "Extract valid URL: http://xo.com/~matthew+%ff-x" 252 | text: "text http://xo.com/~matthew+%ff-x" 253 | expected: ["http://xo.com/~matthew+%ff-x"] 254 | 255 | - description: "Extract valid URL: http://xo.com/~matthew+%ff-,.;x" 256 | text: "text http://xo.com/~matthew+%ff-,.;x" 257 | expected: ["http://xo.com/~matthew+%ff-,.;x"] 258 | 259 | - description: "Extract valid URL: http://xo.com/,.;x" 260 | text: "text http://xo.com/,.;x" 261 | expected: ["http://xo.com/,.;x"] 262 | 263 | - description: "Extract valid URL: http://en.wikipedia.org/wiki/Primer_(film)" 264 | text: "text http://en.wikipedia.org/wiki/Primer_(film)" 265 | expected: ["http://en.wikipedia.org/wiki/Primer_(film)"] 266 | 267 | - description: "Extract valid URL: http://www.ams.org/bookstore-getitem/item=mbk-59" 268 | text: "text http://www.ams.org/bookstore-getitem/item=mbk-59" 269 | expected: ["http://www.ams.org/bookstore-getitem/item=mbk-59"] 270 | 271 | - description: "Extract valid URL: http://✪df.ws/ejp" 272 | text: "text http://✪df.ws/ejp" 273 | expected: ["http://✪df.ws/ejp"] 274 | 275 | - description: "Extract valid URL: http://example.com/" 276 | text: "test http://example.comだよね.comtest/hogehoge" 277 | expected: ["http://example.com"] 
278 | 279 | - description: "Extract valid URL: http://chilp.it/?77e8fd" 280 | text: "text http://chilp.it/?77e8fd" 281 | expected: ["http://chilp.it/?77e8fd"] 282 | 283 | - description: "Extract valid URL: http://x.com/oneletterdomain" 284 | text: "text http://x.com/oneletterdomain" 285 | expected: ["http://x.com/oneletterdomain"] 286 | 287 | - description: "Extract valid URL: http://msdn.microsoft.com/ja-jp/library/system.net.httpwebrequest(v=VS.100).aspx" 288 | text: "text http://msdn.microsoft.com/ja-jp/library/system.net.httpwebrequest(v=VS.100).aspx" 289 | expected: ["http://msdn.microsoft.com/ja-jp/library/system.net.httpwebrequest(v=VS.100).aspx"] 290 | 291 | - description: "Extract valid URL with hyphen as query ending char: https://www.youtube.com/watch?v=LOxOAuDHzaw&list=PLPoq910Q9jXhuH6pit_KwIsck9fEz_9U-" 292 | text: "text https://www.youtube.com/watch?v=LOxOAuDHzaw&list=PLPoq910Q9jXhuH6pit_KwIsck9fEz_9U-" 293 | expected: ["https://www.youtube.com/watch?v=LOxOAuDHzaw&list=PLPoq910Q9jXhuH6pit_KwIsck9fEz_9U-"] 294 | 295 | - description: "DO NOT extract invalid URL: http://no-tld" 296 | text: "text http://no-tld" 297 | expected: [] 298 | 299 | - description: "DO NOT extract invalid URL: http://tld-too-short.x" 300 | text: "text http://tld-too-short.x" 301 | expected: [] 302 | 303 | - description: "DO NOT extract invalid URL with invalid preceding character: (http://twitter.com" 304 | text: "(http://twitter.com" 305 | expected: ["http://twitter.com"] 306 | 307 | - description: "Extract a very long hyphenated sub-domain URL (single letter hyphens)" 308 | text: "text http://word-and-a-number-8-ftw.domain.com/" 309 | expected: ["http://word-and-a-number-8-ftw.domain.com/"] 310 | 311 | - description: "DO NOT Extract a hyphenated TLD (even though it's usually a typo)" 312 | text: "text http://domain.com-that-you-should-have-put-a-space-after" 313 | expected: [] 314 | 315 | - description: "Extract URL ending with # value" 316 | text: "text http://foo.com?#foo text" 317 | expected: ["http://foo.com?#foo"] 318 | 319 | - description: "Extract URLs without protocol on (com|org|edu|gov|net) domains" 320 | text: "foo.com foo.net foo.org foo.edu foo.gov" 321 | expected: ["foo.com", "foo.net", "foo.org", "foo.edu", "foo.gov"] 322 | 323 | - description: "Extract URLs without protocol not on (com|org|edu|gov|net) domains" 324 | text: "foo.baz foo.co.jp www.xxxxxxx.baz www.foo.co.uk wwwww.xxxxxxx foo.comm foo.somecom foo.govedu foo.jp" 325 | expected: ["foo.co.jp", "www.foo.co.uk", "foo.jp"] 326 | 327 | - description: "Extract URLs without protocol on ccTLD with slash" 328 | text: "t.co/abcde bit.ly/abcde" 329 | expected: ["t.co/abcde", "bit.ly/abcde"] 330 | 331 | - description: "Extract URLs with protocol on ccTLD domains" 332 | text: "http://foo.jp http://fooooo.jp" 333 | expected: ["http://foo.jp", "http://fooooo.jp"] 334 | 335 | - description: "Extract URLs with a - or + at the end of the path" 336 | text: "Go to http://example.com/a+ or http://example.com/a-" 337 | expected: ["http://example.com/a+", "http://example.com/a-"] 338 | 339 | - description: "Extract URLs with longer paths ending in -" 340 | text: "Go to http://example.com/view/slug-url-?foo=bar" 341 | expected: ["http://example.com/view/slug-url-?foo=bar"] 342 | 343 | - description: "Extract URLs with an en dash in the path" 344 | text: "Go to https://en.m.wikipedia.org/wiki/Hatfield–McCoy_feud please" 345 | expected: ["https://en.m.wikipedia.org/wiki/Hatfield–McCoy_feud"] 346 | 347 | - description: "Extract URLs beginning with a 
space" 348 | text: "@user Try http:// example.com/path" 349 | expected: ["example.com/path"] 350 | 351 | - description: "Extract long URL without protocol surrounded by CJK characters" 352 | text: "これは日本語です。example.com/path/index.html中国語example.com/path한국" 353 | expected: ["example.com/path/index.html", "example.com/path"] 354 | 355 | - description: "Extract short URL without protocol surrounded by CJK characters" 356 | text: "twitter.comこれは日本語です。example.com中国語t.co/abcde한국twitter.com example2.comテストtwitter.com/abcde" 357 | expected: ["twitter.com", "example.com", "t.co/abcde", "twitter.com", "example2.com", "twitter.com/abcde"] 358 | 359 | - description: "Extract URLs with and without protocol surrounded by CJK characters" 360 | text: "http://twitter.com/これは日本語です。example.com中国語http://t.co/abcde한국twitter.comテストexample2.comテストhttp://twitter.com/abcde" 361 | expected: ["http://twitter.com/", "example.com", "http://t.co/abcde", "twitter.com", "example2.com", "http://twitter.com/abcde"] 362 | 363 | - description: "Extract URLs with protocol and path containing Cyrillic characters" 364 | text: "Go to http://twitter.com/Русские_слова" 365 | expected: ["http://twitter.com/Русские_слова"] 366 | 367 | - description: "Extract non-ASCII host name URLs with protocol, but ignore host names bigger than 63 characters. Also handle exceptions for non-ASCII hostnames longer than 256 characters" 368 | text: "http://exampleこれは日本語です.com/path/index.html http://あああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああああ.com/path/index.html" 369 | expected: ["http://exampleこれは日本語です.com/path/index.html"] 370 | 371 | - description: "Extract short URLs without protocol on ccTLD domains without path" 372 | text: "twitter.jp日本語it.so中国語foo.jp it.so foo.jp" 373 | expected: ["twitter.jp", "it.so", "foo.jp", "it.so", "foo.jp"] 374 | 375 | - description: "DO NOT extract invalid URL" 376 | text: "Hello http://xn--はじめよう.com/index.html" 377 | expected: [] 378 | 379 | - description: "DO NOT Extract URL with domain preceeded by underscore: http://domain-begin_dash_2314352345_dfasd.foo-cow_4352.com" 380 | text: "text http://domain-dash_2314352345_dfasd.foo-cow_4352.com" 381 | expected: [] 382 | 383 | - description: "DO NOT Extract URLs with a - or + in the middle of an email address" 384 | text: "Email me at name.al-lastname@foo.com or name.al+lastname@foo.com" 385 | expected: [] 386 | 387 | - description: "Extract URLs with a - in the middle" 388 | text: "Find my page at name.al-lastname.com" 389 | expected: ["name.al-lastname.com"] 390 | 391 | - description: "Extract some (tv|co) short URLs without protocol on ccTLD domains without path" 392 | text: "MLB.tv vine.co twitch.tv t.co" 393 | expected: ["MLB.tv", "vine.co", "twitch.tv", "t.co"] 394 | 395 | - description: "Extract URLs beginning with a non-breaking space (U+00A0)" 396 | text: "@user Try http:// example.com/path" 397 | expected: ["example.com/path"] 398 | 399 | - description: "Extract URLs with underscores and dashes in the subdomain" 400 | text: "test http://sub_domain-dash.twitter.com" 401 | expected: ["http://sub_domain-dash.twitter.com"] 402 | 403 | - description: "Extract URL with minimum number of valid characters" 404 | text: "test http://a.b.cd" 405 | expected: ["http://a.b.cd"] 406 | 407 | - description: "Extract URLs containing underscores and dashes" 408 | text: 
"test http://a_b.c-d.com" 409 | expected: ["http://a_b.c-d.com"] 410 | 411 | - description: "Extract URLs containing dashes in the subdomain" 412 | text: "test http://a-b.c.com" 413 | expected: ["http://a-b.c.com"] 414 | 415 | - description: "Extract URLs with dashes in the domain name" 416 | text: "test http://twitter-dash.com" 417 | expected: ["http://twitter-dash.com"] 418 | 419 | - description: "Extract URLs with lots of symbols then a period" 420 | text: "http://www.bestbuy.com/site/Currie+Technologies+-+Ezip+400+Scooter/9885188.p?id=1218189013070&skuId=9885188" 421 | expected: ["http://www.bestbuy.com/site/Currie+Technologies+-+Ezip+400+Scooter/9885188.p?id=1218189013070&skuId=9885188"] 422 | 423 | - description: "DO NOT extract URLs containing leading dashes in the subdomain" 424 | text: "test http://-leadingdash.twitter.com" 425 | expected: [] 426 | 427 | - description: "DO NOT extract URLs containing leading dashes in the domain with a subdomain" 428 | text: "test http://leadingdash.-twitter.com" 429 | expected: [] 430 | 431 | - description: "DO NOT extract URLs containing trailing dashes in the subdomain" 432 | text: "test http://trailingdash-.twitter.com" 433 | expected: [] 434 | 435 | - description: "DO NOT extract URLs containing trailing dashes in the domain with a subdomain" 436 | text: "test http://trailingdash.twitter-.com" 437 | expected: [] 438 | 439 | - description: "DO NOT extract URLs containing leading underscores in the subdomain" 440 | text: "test http://_leadingunderscore.twitter.com" 441 | expected: [] 442 | 443 | - description: "DO NOT extract URLs containing leading underscores in the domain with a subdomain" 444 | text: "test http://leadingunderscore._twitter.com" 445 | expected: [] 446 | 447 | - description: "DO NOT extract URLs containing trailing underscores in the subdomain" 448 | text: "test http://trailingunderscore_.twitter.com" 449 | expected: [] 450 | 451 | - description: "DO NOT extract URLs containing trailing underscores in the domain with a subdomain" 452 | text: "test http://trailingunderscore.twitter_.com" 453 | expected: [] 454 | 455 | - description: "DO NOT extract URLs containing leading dashes in the domain name" 456 | text: "test http://-twitter.com" 457 | expected: [] 458 | 459 | - description: "DO NOT extract URLs containing trailing dashes in the domain name" 460 | text: "test http://twitter-.com" 461 | expected: [] 462 | 463 | - description: "DO NOT extract URLs containing underscores in the domain name" 464 | text: "test http://twitter_underscore.com" 465 | expected: [] 466 | 467 | - description: "DO NOT extract URLs containing underscores in the tld" 468 | text: "test http://twitter.c_o_m" 469 | expected: [] 470 | 471 | - description: "Extract valid URL http://www.foo.com/foo/path-with-period./" 472 | text: "test http://www.foo.com/foo/path-with-period./" 473 | expected: ["http://www.foo.com/foo/path-with-period./"] 474 | 475 | - description: "Extract valid URL http://www.foo.org.za/foo/bar/688.1" 476 | text: "test http://www.foo.org.za/foo/bar/688.1" 477 | expected: ["http://www.foo.org.za/foo/bar/688.1"] 478 | 479 | - description: "Extract valid URL http://www.foo.com/bar-path/some.stm?param1=foo;param2=P1|0||P2|0" 480 | text: "test http://www.foo.com/bar-path/some.stm?param1=foo;param2=P1|0||P2|0" 481 | expected: ["http://www.foo.com/bar-path/some.stm?param1=foo;param2=P1|0||P2|0"] 482 | 483 | - description: "Extract valid URL http://foo.com/bar/123/foo_&_bar/" 484 | text: "test http://foo.com/bar/123/foo_&_bar/" 485 | expected: 
["http://foo.com/bar/123/foo_&_bar/"] 486 | 487 | - description: "Extract valid URL http://www.cp.sc.edu/events/65" 488 | text: "test http://www.cp.sc.edu/events/65 test" 489 | expected: ["http://www.cp.sc.edu/events/65"] 490 | 491 | - description: "Extract valid URL http://www.andersondaradio.no.comunidades.net/" 492 | text: "http://www.andersondaradio.no.comunidades.net/ test test" 493 | expected: ["http://www.andersondaradio.no.comunidades.net/"] 494 | 495 | - description: "Extract valid URL ELPAÍS.com" 496 | text: "test ELPAÍS.com" 497 | expected: ["ELPAÍS.com"] 498 | 499 | - description: "DO NOT include period at the end of URL" 500 | text: "test http://twitter.com/." 501 | expected: ["http://twitter.com/"] 502 | 503 | - description: "Extract a URL with '?' in fragment" 504 | text: "http://tn.com.ar/show/00056158/la-estrella-del-certamen-el-turno-de-pamela-anderson?fb_xd_fragment#?=&cb=fe17523f223b7&relation=parent.parent&transport=fragment&type=resize&height=20&ackdata" 505 | expected: ["http://tn.com.ar/show/00056158/la-estrella-del-certamen-el-turno-de-pamela-anderson?fb_xd_fragment#?=&cb=fe17523f223b7&relation=parent.parent&transport=fragment&type=resize&height=20&ackdata"] 506 | 507 | - description: "Extract a URL with '?' in fragment in a text" 508 | text: "text http://tn.com.ar/show/00056158/la-estrella-del-certamen-el-turno-de-pamela-anderson?fb_xd_fragment#?=&cb=fe17523f223b7&relation=parent.parent&transport=fragment&type=resize&height=20&ackdata text" 509 | expected: ["http://tn.com.ar/show/00056158/la-estrella-del-certamen-el-turno-de-pamela-anderson?fb_xd_fragment#?=&cb=fe17523f223b7&relation=parent.parent&transport=fragment&type=resize&height=20&ackdata"] 510 | 511 | # A common cause of runaway regex engines. 512 | - description: "Extract a URL with a ton of trailing periods" 513 | text: "Test a ton of periods http://example.com/path.........................................." 514 | expected: ["http://example.com/path"] 515 | 516 | - description: "Extract a URL with a ton of trailing commas" 517 | text: "Test a ton of periods http://example.com/,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,," 518 | expected: ["http://example.com/"] 519 | 520 | - description: "Extract a URL with a ton of trailing '!'" 521 | text: "Test a ton of periods http://example.com/path/!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" 
522 | expected: ["http://example.com/path/"] 523 | 524 | - description: "DO NOT extract URLs in hashtag or @mention" 525 | text: "#test.com @test.com #http://test.com @http://test.com #t.co/abcde @t.co/abcde" 526 | expected: [] 527 | 528 | - description: "Extract a t.co URL with a trailing apostrophe" 529 | text: "I really like http://t.co/pbY2NfTZ's website" 530 | expected: ["http://t.co/pbY2NfTZ"] 531 | 532 | - description: "Extract a t.co URL with a trailing hyphen" 533 | text: "Check this site out http://t.co/FNkPfmii- it's great" 534 | expected: ["http://t.co/FNkPfmii"] 535 | 536 | - description: "Extract a t.co URL with a trailing colon" 537 | text: "According to http://t.co/ulYGBYSo: the internet is cool" 538 | expected: ["http://t.co/ulYGBYSo"] 539 | 540 | - description: "Extract a t.co URL with a long path" 541 | text: "I really like http://t.co/abcdefghijklmnopqrstuvwxyz0123456789" 542 | expected: ["http://t.co/abcdefghijklmnopqrstuvwxyz0123456789"] 543 | 544 | - description: "DO NOT extract URLs with > 40 characters in a t.co slug" 545 | text: "I really like http://t.co/abcdefghijklmnopqrstuvwxyz012345678901234" 546 | expected: [] 547 | 548 | - description: "Extract domain followed by Japanese characters" 549 | text: "example.comてすとですtwitter.みんなです" 550 | expected: ["example.com", "twitter.みんな"] 551 | 552 | - description: "Extract URL before newline" 553 | text: "http://twitter.com\nhttp://example.com\nhttp://example.com/path\nexample.com/path\nit.so\nit.so/abcde" 554 | expected: ["http://twitter.com", "http://example.com", "http://example.com/path", "example.com/path", "it.so", "it.so/abcde"] 555 | 556 | - description: "DO NOT extract URL if preceded by $" 557 | text: "$http://twitter.com $twitter.com $http://t.co/abcde $t.co/abcde $t.co $TVI.CA $RBS.CA" 558 | expected: [] 559 | 560 | - description: "DO NOT extract .bz2 file name as URL" 561 | text: "long.test.tar.bz2 test.tar.bz2 tar.bz2" 562 | expected: [] 563 | 564 | - description: "DO NOT extract URL with gTLD followed by @ sign" 565 | text: "john.doe.gov@mail.com" 566 | expected: [] 567 | 568 | - description: "DO NOT extract URL with ccTLD followed by @ sign" 569 | text: "john.doe.jp@mail.com" 570 | expected: [] 571 | 572 | urls_with_indices: 573 | - description: "Extract a URL" 574 | text: "text http://google.com" 575 | expected: 576 | - url: "http://google.com" 577 | indices: [5, 22] 578 | 579 | - description: "Extract a URL from a Japanese tweet" 580 | text: "皆さん見てください! 
http://google.com" 581 | expected: 582 | - url: "http://google.com" 583 | indices: [11, 28] 584 | 585 | - description: "Extract URLs without protocol on ccTLD with slash" 586 | text: "t.co/abcde bit.ly/abcde" 587 | expected: 588 | - url: "t.co/abcde" 589 | indices: [0, 10] 590 | - url: "bit.ly/abcde" 591 | indices: [11, 23] 592 | 593 | - description: "Extract URLs without protocol surrounded by CJK characters" 594 | text: "twitter.comこれは日本語です。example.com中国語t.co/abcde한국twitter.com example2.comテストtwitter.com/abcde" 595 | expected: 596 | - url: "twitter.com" 597 | indices: [0, 11] 598 | - url: "example.com" 599 | indices: [20, 31] 600 | - url: "t.co/abcde" 601 | indices: [34, 44] 602 | - url: "twitter.com" 603 | indices: [46, 57] 604 | - url: "example2.com" 605 | indices: [58, 70] 606 | - url: "twitter.com/abcde" 607 | indices: [73, 90] 608 | 609 | - description: "Extract URLs with and without protocol surrounded by CJK characters" 610 | text: "http://twitter.com/これは日本語です。example.com中国語http://t.co/abcde한국twitter.comテストexample2.comテストhttp://twitter.com/abcde" 611 | expected: 612 | - url: "http://twitter.com/" 613 | indices: [0, 19] 614 | - url: "example.com" 615 | indices: [28, 39] 616 | - url: "http://t.co/abcde" 617 | indices: [42, 59] 618 | - url: "twitter.com" 619 | indices: [61, 72] 620 | - url: "example2.com" 621 | indices: [75, 87] 622 | - url: "http://twitter.com/abcde" 623 | indices: [90, 114] 624 | 625 | - description: "Extract t.co URLs skipping trailing characters and adjusting indices correctly" 626 | text: "http://t.co/pbY2NfTZ's http://t.co/2vYHpAc5; http://t.co/ulYGBYSo: http://t.co/8MkmHU0k+c http://t.co/TKLp64dY.x http://t.co/8t7G3ddS#a http://t.co/FNkPfmii-" 627 | expected: 628 | - url: "http://t.co/pbY2NfTZ" 629 | indices: [0, 20] 630 | - url: "http://t.co/2vYHpAc5" 631 | indices: [23, 43] 632 | - url: "http://t.co/ulYGBYSo" 633 | indices: [45, 65] 634 | - url: "http://t.co/8MkmHU0k" 635 | indices: [67, 87] 636 | - url: "http://t.co/TKLp64dY" 637 | indices: [90, 110] 638 | - url: "http://t.co/8t7G3ddS" 639 | indices: [113, 133] 640 | - url: "http://t.co/FNkPfmii" 641 | indices: [136, 156] 642 | 643 | - description: "Properly extract URL that contains t.co in referer" 644 | text: "http://www.foo.com?referer=https://t.co/abcde http://t.co/xyzzy" 645 | expected: 646 | - url: "http://www.foo.com?referer=https://t.co/abcde" 647 | indices: [0, 45] 648 | - url: "http://t.co/xyzzy" 649 | indices: [46, 63] 650 | 651 | - description: "Extract correct indices for duplicate instances of the same URL" 652 | text: "http://t.co http://t.co" 653 | expected: 654 | - url: "http://t.co" 655 | indices: [0, 11] 656 | - url: "http://t.co" 657 | indices: [12, 23] 658 | 659 | - description: "Extract I18N URL" 660 | text: "test http://xn--ls8h.XN--ls8h.la/" 661 | expected: 662 | - url: "http://xn--ls8h.XN--ls8h.la/" 663 | indices: [5, 33] 664 | 665 | - description: "Extract URLs with IDN(not encoded)" 666 | text: "test http://foobar.みんな/ http://foobar.中国/ http://foobar.پاکستان/ " 667 | expected: 668 | - url: "http://foobar.みんな/" 669 | indices: [5, 23] 670 | - url: "http://foobar.中国/" 671 | indices: [24, 41] 672 | - url: "http://foobar.پاکستان/" 673 | indices: [42, 64] 674 | 675 | urls_with_directional_markers: 676 | - description: "Extract URLs from RTL text" 677 | text: "\U00002066\U0000202Atest abcdef.com پاکستان http://twitter.com/\U0000202C\U00002069" 678 | expected: 679 | - url: "abcdef.com" 680 | indices: [7, 17] 681 | - url: "http://twitter.com/" 682 | indices: [26, 45] 683 | 684 | - 
description: "Extract URLs from RTL text with embedded directional marks" 685 | text: "This is a test \U00002066\U0000202Atwitter.com\U0000202C\U00002069 \U00002066\U0000202Ahttp://foobar.پاکستان/\U0000202C\U00002069⁩ قطر فلسطين عمان" 686 | expected: 687 | - url: "twitter.com" 688 | indices: [17, 28] 689 | - url: "http://foobar.پاکستان/" 690 | indices: [33, 55] 691 | 692 | tco_urls_with_params: 693 | - description: "Extract valid URL with params: https://t.co/UqIyJAJTfo?amp=1" 694 | text: "text https://t.co/UqIyJAJTfo?amp=1" 695 | expected: ["https://t.co/UqIyJAJTfo?amp=1"] 696 | 697 | - description: "Extract valid URL with params: https://t.co/UqIyJAJTfo?type=js" 698 | text: "text https://t.co/UqIyJAJTfo?type=js" 699 | expected: ["https://t.co/UqIyJAJTfo?type=js"] 700 | 701 | - description: "Extract valid URL with params: https://t.co/UqIyJAJTfo?ssr=true" 702 | text: "text https://t.co/UqIyJAJTfo?ssr=true" 703 | expected: ["https://t.co/UqIyJAJTfo?ssr=true"] 704 | 705 | - description: "Extract a valid URL with params: https://t.co/asdfdf?a=b#123" 706 | text: "text https://t.co/asdfdf?a=b#123" 707 | expected: ["https://t.co/asdfdf?a=b#123"] 708 | 709 | - description: "Extract a valid URL with params: https://t.co/sadfasdf?a=b&c=d" 710 | text: "text https://t.co/sadfasdf?a=b&c=d" 711 | expected: ["https://t.co/sadfasdf?a=b&c=d"] 712 | 713 | hashtags: 714 | - description: "Extract hashtag after emoji without variant selector (uFE0E or uFE0F)" 715 | text: "a ✌#hashtag here" 716 | expected: ["hashtag"] 717 | 718 | - description: "Extract hashtag after emoji with variant selector FE0E" 719 | text: "a ✌︎#hashtag here" 720 | expected: ["hashtag"] 721 | 722 | - description: "Extract hashtag after emoji with variant selector FE0F" 723 | text: "a ✌️#hashtag here" 724 | expected: ["hashtag"] 725 | 726 | - description: "Extract hashtag after emoji with skin tone without variant selector (FE0E or FE0F)" 727 | text: "a ✌🏿#hashtag here" 728 | expected: ["hashtag"] 729 | 730 | - description: "Extract hashtag after emoji with skin tone with variant selector FE0F" 731 | text: "a ✌🏿️#hashtag here" 732 | expected: ["hashtag"] 733 | 734 | - description: "Extract hashtag after emoji with zero-width-joiner" 735 | text: "a 👨‍👩‍👧#hashtag here" 736 | expected: ["hashtag"] 737 | 738 | - description: "Extract an all-alpha hashtag" 739 | text: "a #hashtag here" 740 | expected: ["hashtag"] 741 | 742 | - description: "Extract a letter-then-number hashtag" 743 | text: "this is #hashtag1" 744 | expected: ["hashtag1"] 745 | 746 | - description: "Extract a number-then-letter hashtag" 747 | text: "#1hashtag is this" 748 | expected: ["1hashtag"] 749 | 750 | - description: "DO NOT Extract an all-numeric hashtag" 751 | text: "On the #16 bus" 752 | expected: [] 753 | 754 | - description: "DO NOT Extract a single numeric hashtag" 755 | text: "#0" 756 | expected: [] 757 | 758 | - description: "Extract hashtag after bracket" 759 | text: "(#hashtag1 )#hashtag2 [#hashtag3 ]#hashtag4 ’#hashtag5’#hashtag6" 760 | expected: ["hashtag1", "hashtag2", "hashtag3", "hashtag4", "hashtag5", "hashtag6"] 761 | 762 | - description: "Extract a hashtag containing ñ" 763 | text: "I'll write more tests #mañana" 764 | expected: ["mañana"] 765 | 766 | - description: "Extract a hashtag containing é" 767 | text: "Working remotely #café" 768 | expected: ["café"] 769 | 770 | - description: "Extract a hashtag containing ü" 771 | text: "Getting my Oktoberfest on #münchen" 772 | expected: ["münchen"] 773 | 774 | - description: "DO NOT Extract a hashtag 
containing Japanese" 775 | text: "this is not valid: # 会議中 ハッシュ" 776 | expected: [] 777 | 778 | - description: "Extract a hashtag in Korean" 779 | text: "What is #트위터 anyway?" 780 | expected: ["트위터"] 781 | 782 | - description: "Extract a half-width Hangul hashtag" 783 | text: "Just random half-width Hangul #ᆪᆭᄚ" 784 | expected: ["ᆪᆭᄚ"] 785 | 786 | - description: "Extract a hashtag in Russian" 787 | text: "What is #ашок anyway?" 788 | expected: ["ашок"] 789 | 790 | - description: "Extract a starting katakana hashtag" 791 | text: "#カタカナ is a hashtag" 792 | expected: ["カタカナ"] 793 | 794 | - description: "Extract a starting hiragana hashtag" 795 | text: "#ひらがな FTW!" 796 | expected: ["ひらがな"] 797 | 798 | - description: "Extract a starting kanji hashtag" 799 | text: "#漢字 is the future" 800 | expected: ["漢字"] 801 | 802 | - description: "Extract a trailing katakana hashtag" 803 | text: "Hashtag #カタカナ" 804 | expected: ["カタカナ"] 805 | 806 | - description: "Extract a trailing hiragana hashtag" 807 | text: "Japanese hashtags #ひらがな" 808 | expected: ["ひらがな"] 809 | 810 | - description: "Extract a trailing kanji hashtag" 811 | text: "Study time #漢字" 812 | expected: ["漢字"] 813 | 814 | - description: "Extract a central katakana hashtag" 815 | text: "See my #カタカナ hashtag?" 816 | expected: ["カタカナ"] 817 | 818 | - description: "Extract a central hiragana hashtag" 819 | text: "Study #ひらがな for fun and profit" 820 | expected: ["ひらがな"] 821 | 822 | - description: "Extract a central kanji hashtag" 823 | text: "Some say #漢字 is the past. what do they know?" 824 | expected: ["漢字"] 825 | 826 | - description: "Extract a Kanji/Katakana mixed hashtag" 827 | text: "日本語ハッシュタグテスト #日本語ハッシュタグ" 828 | expected: ["日本語ハッシュタグ"] 829 | 830 | - description: "Extract a hashtag after a punctuation" 831 | text: "日本語ハッシュテスト。#日本語ハッシュタグ" 832 | expected: ["日本語ハッシュタグ"] 833 | 834 | - description: "DO NOT include a punctuation in a hashtag" 835 | text: "#日本語ハッシュタグ。" 836 | expected: ["日本語ハッシュタグ"] 837 | 838 | - description: "Extract a full-width Alnum hashtag" 839 | text: "全角英数字ハッシュタグ #hashtag123" 840 | expected: ["hashtag123"] 841 | 842 | - description: "DO NOT extract a hashtag without a preceding space" 843 | text: "日本語ハッシュタグ#日本語ハッシュタグ" 844 | expected: [] 845 | 846 | - description: "Hashtag with chouon" 847 | text: "長音ハッシュタグ。#サッカー" 848 | expected: ["サッカー"] 849 | 850 | - description: "Hashtag with half-width chouon" 851 | text: "長音ハッシュタグ。#サッカー" 852 | expected: ["サッカー"] 853 | 854 | - description: "Hashtag with half-widh voiced sounds marks" 855 | text: "#ハッシュタグ #パピプペポ" 856 | expected: ["ハッシュタグ", "パピプペポ"] 857 | 858 | - description: "Hashtag with half-width # after full-width !" 859 | text: "できましたよー!#日本語ハッシュタグ。" 860 | expected: ["日本語ハッシュタグ"] 861 | 862 | - description: "Hashtag with full-width # after full-width !" 
863 | text: "できましたよー!#日本語ハッシュタグ。" 864 | expected: ["日本語ハッシュタグ"] 865 | 866 | - description: "Hashtag with ideographic iteration mark" 867 | text: "#云々 #学問のすゝめ #いすゞ #各〻 #各〃" 868 | expected: ["云々", "学問のすゝめ", "いすゞ", "各〻", "各〃"] 869 | 870 | - description: "Extract hashtag with fullwidth tilde" 871 | text: "#メ~テレ ハッシュタグ内で~が認識されず" 872 | expected: ["メ~テレ"] 873 | 874 | - description: "Extract hashtag with wave dash" 875 | text: "#メ〜テレ ハッシュタグ内で~が認識されず" 876 | expected: ["メ〜テレ"] 877 | 878 | - description: "Hashtags with ş (U+015F)" 879 | text: "Here’s a test tweet for you: #Ateş #qrşt #ştu #ş" 880 | expected: ["Ateş", "qrşt", "ştu", "ş"] 881 | 882 | - description: "Hashtags with İ (U+0130) and ı (U+0131)" 883 | text: "Here’s a test tweet for you: #İn #ın" 884 | expected: ["İn", "ın"] 885 | 886 | - description: "Hashtag before punctuations" 887 | text: "#hashtag: #hashtag; #hashtag, #hashtag. #hashtag! #hashtag?" 888 | expected: ["hashtag", "hashtag", "hashtag", "hashtag", "hashtag", "hashtag"] 889 | 890 | - description: "Hashtag after punctuations" 891 | text: ":#hashtag ;#hashtag ,#hashtag .#hashtag !#hashtag ?#hashtag" 892 | expected: ["hashtag", "hashtag", "hashtag", "hashtag", "hashtag", "hashtag"] 893 | 894 | - description: "Hashtag before newline" 895 | text: "#hashtag\ntest\n#hashtag2\ntest\n#hashtag3\n" 896 | expected: ["hashtag", "hashtag2", "hashtag3"] 897 | 898 | - description: "DO NOT extract hashtag when # is followed by URL" 899 | text: "#http://twitter.com #https://twitter.com" 900 | expected: [] 901 | 902 | - description: "DO NOT extract hashtag if it's a part of URL" 903 | text: "http://twitter.com/#hashtag twitter.com/#hashtag" 904 | expected: [] 905 | 906 | - description: "Extract hashtags with Latin extended characters" 907 | text: "#Azərbaycanca #mûǁae #Čeština #Ċaoiṁín" 908 | expected: ["Azərbaycanca", "mûǁae", "Čeština", "Ċaoiṁín"] 909 | 910 | - description: "Extract Arabic hashtags" 911 | text: "#سیاست #ایران #السياسة #السياح #لغات #اتمی #کنفرانس #العربية #الجزيرة #فارسی" 912 | expected: ["سیاست", "ایران", "السياسة", "السياح", "لغات", "اتمی", "کنفرانس", "العربية", "الجزيرة", "فارسی"] 913 | 914 | - description: "Extract Arabic hashtags with underscore" 915 | text: "#برنامه_نویسی #رییس_جمهور #رئيس_الوزراء, #ثبت_نام. 
#لس_آنجلس" 916 | expected: ["برنامه_نویسی", "رییس_جمهور", "رئيس_الوزراء", "ثبت_نام", "لس_آنجلس"] 917 | 918 | - description: "Extract Hebrew hashtags" 919 | text: "#עַל־יְדֵי #וכו׳ #מ״כ" 920 | expected: ["עַל־יְדֵי", "וכו׳", "מ״כ"] 921 | 922 | - description: "Extract Thai hashtags" 923 | text: "#ผู้เริ่ม #การเมือง #รายละเอียด #นักท่องเที่ยว #ของขวัญ #สนามบิน #เดินทาง #ประธาน" 924 | expected: ["ผู้เริ่ม", "การเมือง", "รายละเอียด", "นักท่องเที่ยว", "ของขวัญ", "สนามบิน", "เดินทาง", "ประธาน"] 925 | 926 | - description: "Extract Arabic hashtags with Zero-Width Non-Joiner" 927 | text: "#أي‌بي‌إم #می‌خواهم" 928 | expected: ["أي‌بي‌إم", "می‌خواهم"] 929 | 930 | - description: "Extract Amharic hashtag" 931 | text: "የአላህ መልእክተኛ ሰለላሁ ዓለይሂ ወሰለም #ኢትዮሙስሊምስ" 932 | expected: ["ኢትዮሙስሊምስ"] 933 | 934 | - description: "Extract Sinhala hashtag with Zero-Width Joiner (U+200D)" 935 | text: "#ශ්‍රීලංකා" 936 | expected: ["ශ්‍රීලංකා"] 937 | 938 | - description: "Extract Arabic and Persian hashtags with numbers" 939 | text: "#۳۴۵هشتگ #هشتگ۶۷۸ #ســـلام_عليكم_٤٠٦" 940 | expected: ["۳۴۵هشتگ","هشتگ۶۷۸","ســـلام_عليكم_٤٠٦"] 941 | 942 | - description: "Extract Hindi hashtags" 943 | text: "#महात्मा #महात्मा_१२३४ #१२३४ गांधी" 944 | expected: ["महात्मा","महात्मा_१२३४"] 945 | 946 | - description: "Extract Indic script hashtags" 947 | text: "#বাংলা #ગુજરાતી #ಕನ್ನಡ #മലയാളം #ଓଡ଼ିଆ #ਪੰਜਾਬੀ #සිංහල #தமிழ் #తెలుగు" 948 | expected: ["বাংলা","ગુજરાતી","ಕನ್ನಡ","മലയാളം","ଓଡ଼ିଆ","ਪੰਜਾਬੀ","සිංහල","தமிழ்","తెలుగు"] 949 | 950 | - description: "Extract Tibetan hashtags" 951 | text: "#བོད་སྐད་ #བོད་སྐད" 952 | expected: ["བོད་སྐད་","བོད་སྐད"] 953 | 954 | - description: "Extract Khmer, Burmese, Laotian hashtags" 955 | text: "#មហាត្មះគន្ធី #မြင့်မြတ်သော #ຊີວະສາດ" 956 | expected: ["មហាត្មះគន្ធី","မြင့်မြတ်သော","ຊີວະສາດ"] 957 | 958 | - description: "Extract Greek hashtag" 959 | text: "#Μαχάτμα_Γκάντι ήταν Ινδός πολιτικός" 960 | expected: ["Μαχάτμα_Γκάντι"] 961 | 962 | - description: "Extract Armenian and Georgian hashtags" 963 | text: "#Մահաթմա #მაჰათმა" 964 | expected: ["Մահաթմա","მაჰათმა"] 965 | 966 | - description: "Extract hashtag with middle dot" 967 | text: "#il·lusió" 968 | expected: ["il·lusió"] 969 | 970 | - description: "DO NOT extract hashtags without a letter" 971 | text: "#_ #1_2 #122 #〃" 972 | expected: [] 973 | 974 | hashtags_from_astral: 975 | - description: "Extract hashtag with letter from astral plane (U+20021)" 976 | text: "#\U00020021" 977 | expected: ["\U00020021"] 978 | 979 | - description: "Extract hashtag with letter plus marker from astral plane (U+16f04 U+16f51)" 980 | text: "#\U00016f04\U00016f51" 981 | expected: ["\U00016f04\U00016f51"] 982 | 983 | - description: "Extract hashtag with letter plus number from astral plane (U+104a0)" 984 | text: "#\U00000041\U000104a0" 985 | expected: ["A\U000104a0"] 986 | 987 | hashtags_with_indices: 988 | - description: "Extract a hastag at the start" 989 | text: "#hashtag here" 990 | expected: 991 | - hashtag: "hashtag" 992 | indices: [0, 8] 993 | 994 | - description: "Extract a hastag at the end" 995 | text: "test a #hashtag" 996 | expected: 997 | - hashtag: "hashtag" 998 | indices: [7, 15] 999 | 1000 | - description: "Extract a hastag in the middle" 1001 | text: "test a #hashtag in a string" 1002 | expected: 1003 | - hashtag: "hashtag" 1004 | indices: [7, 15] 1005 | 1006 | - description: "Extract only a valid hashtag" 1007 | text: "#123 a #hashtag in a string" 1008 | expected: 1009 | - hashtag: "hashtag" 1010 | indices: [7, 15] 1011 | 1012 | - description: "Extract a hashtag in a 
string of multi-byte characters" 1013 | text: "会議中 #hashtag 会議中" 1014 | expected: 1015 | - hashtag: "hashtag" 1016 | indices: [4, 12] 1017 | 1018 | - description: "Extract multiple valid hashtags" 1019 | text: "One #two three #four" 1020 | expected: 1021 | - hashtag: "two" 1022 | indices: [4, 8] 1023 | - hashtag: "four" 1024 | indices: [15, 20] 1025 | 1026 | - description: "Extract a non-latin hashtag" 1027 | text: "Hashtags in #русский!" 1028 | expected: 1029 | - hashtag: "русский" 1030 | indices: [12, 20] 1031 | 1032 | - description: "Extract multiple non-latin hashtags" 1033 | text: "Hashtags in #中文, #日本語, #한국말, and #русский! Try it out!" 1034 | expected: 1035 | - hashtag: "中文" 1036 | indices: [12, 15] 1037 | - hashtag: "日本語" 1038 | indices: [17, 21] 1039 | - hashtag: "한국말" 1040 | indices: [23, 27] 1041 | - hashtag: "русский" 1042 | indices: [33, 41] 1043 | 1044 | cashtags: 1045 | - description: "Extract cashtags" 1046 | text: "Example cashtags: $TEST $Stock $symbol" 1047 | expected: ["TEST", "Stock", "symbol"] 1048 | 1049 | - description: "Extract cashtags with . or _" 1050 | text: "Example cashtags: $TEST.T $test.tt $Stock_X $symbol_ab" 1051 | expected: ["TEST.T", "test.tt", "Stock_X", "symbol_ab"] 1052 | 1053 | - description: "Do not extract cashtags if they contain numbers" 1054 | text: "$123 $test123 $TE123ST" 1055 | expected: [] 1056 | 1057 | - description: "Do not extract cashtags with non-ASCII characters" 1058 | text: "$ストック $株" 1059 | expected: [] 1060 | 1061 | - description: "Do not extract cashtags with punctuations" 1062 | text: "$ $. $- $@ $! $() $+" 1063 | expected: [] 1064 | 1065 | - description: "Do not include trailing . or _" 1066 | text: "$TEST. $TEST_" 1067 | expected: ["TEST", "TEST"] 1068 | 1069 | - description: "Do not extract cashtags if there is no space before $" 1070 | text: "$OK$NG$BAD text$NO .$NG $$NG" 1071 | expected: ["OK"] 1072 | 1073 | - description: "Do not extract too long cashtags" 1074 | text: "$CashtagMustBeLessThanSixCharacter" 1075 | expected: [] 1076 | 1077 | cashtags_with_indices: 1078 | - description: "Extract cashtags" 1079 | text: "Example: $TEST $symbol test" 1080 | expected: 1081 | - cashtag: "TEST" 1082 | indices: [9, 14] 1083 | - cashtag: "symbol" 1084 | indices: [15, 22] 1085 | 1086 | - description: "Extract cashtags with . or _" 1087 | text: "Example: $TEST.T test $symbol_ab end" 1088 | expected: 1089 | - cashtag: "TEST.T" 1090 | indices: [9, 16] 1091 | - cashtag: "symbol_ab" 1092 | indices: [22, 32] -------------------------------------------------------------------------------- /tests/cases/validate.yml: -------------------------------------------------------------------------------- 1 | tests: 2 | tweets: 3 | - description: "Valid Tweet: < 20 characters" 4 | text: "I am a Tweet" 5 | expected: true 6 | 7 | - description: "Valid Tweet: 140 characters" 8 | text: "A lie gets halfway around the world before the truth has a chance to get its pants on. Winston Churchill (1874-1965) http://bit.ly/dJpywL" 9 | expected: true 10 | 11 | - description: "Valid Tweet: 140 characters (with accents)" 12 | text: "A lié géts halfway arøünd thé wørld béføré thé truth has a chance tø get its pants øn. 
Winston Churchill (1874-1965) http://bit.ly/dJpywL" 13 | expected: true 14 | 15 | - description: "Valid Tweet: 140 characters (double byte characters)" 16 | text: "のののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののののの" 17 | expected: true 18 | 19 | - description: "Valid Tweet: 140 characters (double word characters)" 20 | text: "\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431\U0001f431" 21 | expected: true 22 | 23 | - description: "Invalid Tweet: no characters (empty)" 24 | text: "" 25 | expected: false 26 | 27 | - description: "Invalid Tweet: 141 characters" 28 | text: "A lie gets halfway around the world before the truth has a chance to get its pants on. -- Winston Churchill (1874-1965) http://bit.ly/dJpywL" 29 | expected: false 30 | 31 | - description: "Invalid Tweet: 141 characters (due to newline)" 32 | text: "A lie gets halfway around the world before the truth has a chance to get its pants on. 
\n- Winston Churchill (1874-1965) http://bit.ly/dJpywL" 33 | expected: false 34 | 35 | usernames: 36 | - description: "Valid username: a-z < 20 characters" 37 | text: "@username" 38 | expected: true 39 | 40 | - description: "All numeric username are allowed" 41 | text: "@12345" 42 | expected: true 43 | 44 | - description: "Usernames should allow the _ character" 45 | text: "@example_name" 46 | expected: true 47 | 48 | - description: "Usernames SHOULD NOT allow the - character" 49 | text: "@example-name" 50 | expected: false 51 | 52 | lists: 53 | - description: "Valid list: a-z < 20 characters" 54 | text: "@username/list" 55 | expected: true 56 | 57 | - description: "A username alone SHOULD NOT be considered a valid list" 58 | text: "@username" 59 | expected: false 60 | 61 | - description: "A username followed by a slash SHOULD NOT be considered a valid list" 62 | text: "@username/" 63 | expected: false 64 | 65 | - description: "Validation SHOULD NOT allow leading spaces" 66 | text: " @username/list" 67 | expected: false 68 | 69 | - description: "Validation SHOULD NOT allow trailing spaces" 70 | text: "@username/list " 71 | expected: false 72 | 73 | hashtags: 74 | - description: "Valid hashtag: a-z < 20 characters" 75 | text: "#hashtag" 76 | expected: true 77 | 78 | - description: "Valid hashtag: number followed by letters" 79 | text: "#1st" 80 | expected: true 81 | 82 | - description: "Valid hashtag: letters and numbers mixed" 83 | text: "#that1time" 84 | expected: true 85 | 86 | - description: "Valid hashtag: letter followed by numbers" 87 | text: "#easyas123" 88 | expected: true 89 | 90 | - description: "Invalid hashtag: all numbers" 91 | text: "#12345" 92 | expected: false 93 | 94 | - description: "Valid hashtag: Russian text" 95 | text: "#ашок" 96 | expected: true 97 | 98 | - description: "Valid hashtag: Korean text" 99 | text: "#트위터" 100 | expected: true 101 | 102 | urls: 103 | - description: "Valid url: protocol + domain" 104 | text: "http://example.com" 105 | expected: true 106 | 107 | - description: "Valid url: ssl + domain + path + query" 108 | text: "https://example.com/path/to/resource?search=foo&lang=en" 109 | expected: true 110 | 111 | - description: "Valid url: protocol + domain + path + fragment" 112 | text: "http://twitter.com/#!/twitter" 113 | expected: true 114 | 115 | - description: "Valid url: cased protocol and domain" 116 | text: "HTTPS://www.ExaMPLE.COM/index.html" 117 | expected: true 118 | 119 | - description: "Valid url: port and userinfo" 120 | text: "http://user:PASSW0RD@example.com:8080/login.php" 121 | expected: true 122 | 123 | - description: "Valid url: matrix path parameters" 124 | text: "http://sports.yahoo.com/nfl/news;_ylt=Aom0;ylu=XyZ?slug=ap-superbowlnotebook" 125 | expected: true 126 | 127 | - description: "Valid url: ipv4" 128 | text: "http://192.168.0.1/index.html?src=asdf" 129 | expected: true 130 | 131 | - description: "Valid url: ipv6" 132 | text: "http://[3ffe:1900:4545:3:200:f8ff:fe21:67cf]:80/index.html" 133 | expected: true 134 | 135 | - description: "Valid url: underscore in subdomain" 136 | text: "http://test_underscore.twitter.com" 137 | expected: true 138 | 139 | - description: "Valid url: sub delims and question marks" 140 | text: "http://example.com?foo=$bar.;baz?BAZ&c=d-#top/?stories+" 141 | expected: true 142 | 143 | - description: "Valid unicode url: unicode domain" 144 | text: "http://☃.net/" 145 | expected: true 146 | 147 | - description: "Valid url: Cyrillic characters in path" 148 | text: "http://example.com/Русские_слова" 149 | 
expected: true 150 | 151 | - description: "Valid url: trailing hyphen" 152 | text: "https://www.youtube.com/playlist?list=PL0ZPu8XSRTB7wZzn0mLHMvyzVFeRxbWn-" 153 | expected: true 154 | 155 | - description: "Invalid url: invalid scheme" 156 | text: "ftp://www.example.com/" 157 | expected: false 158 | 159 | - description: "Invalid url: invalid userinfo characters" 160 | text: "https://user:pass[word]@www.example.com/" 161 | expected: false 162 | 163 | - description: "Invalid url: underscore in domain" 164 | text: "http://domain-dash_2314352345_dfasd.foo-cow_4352.com" 165 | expected: false 166 | 167 | - description: "Invalid url: domain beginning dash" 168 | text: "http://www.-domain4352.com/" 169 | expected: false 170 | 171 | - description: "Invalid url: domain trailing dash" 172 | text: "http://www.domain4352-.com/" 173 | expected: false 174 | 175 | - description: "Invalid url: unicode domain trailing dash" 176 | text: "http://☃-.net/" 177 | expected: false 178 | 179 | - description: "Invalid url: improperly encoded unicode domain" 180 | text: "http://%e2%98%83.net/" 181 | expected: false 182 | 183 | - description: "Invalid url: invalid IP" 184 | text: "http://256.1.2.3/" 185 | expected: false 186 | 187 | - description: "Invalid url: invalid char in path" 188 | text: "http://en.wikipedia.org/wiki/\"#Punctuation" 189 | expected: false 190 | 191 | - description: "Invalid url: trailing space" 192 | text: "http://example.com/#anchor " 193 | expected: false 194 | 195 | - description: "Invalid url: domain has leading hyphen" 196 | text: "http://test.-twitter.com" 197 | expected: false 198 | 199 | urls_without_protocol: 200 | - description: "Valid url without protocol: domain + gTLD" 201 | text: "example.com" 202 | expected: true 203 | 204 | - description: "Valid url without protocol: subdomain + domain + gTLD" 205 | text: "www.example.com" 206 | expected: true 207 | 208 | - description: "Valid url without protocol: domain + ccTLD" 209 | text: "t.co" 210 | expected: true 211 | 212 | - description: "Valid url without protocol: subdomain + domain + ccTLD" 213 | text: "foo.co.jp" 214 | expected: true 215 | 216 | - description: "Valid url without protocol: domain + path + query" 217 | text: "example.com/path/to/resource?search=foo&lang=en" 218 | expected: true 219 | 220 | WeightedTweetsCounterTest: 221 | - description: "Regular Tweet with url" 222 | text: "Hi http://test.co" 223 | expected: 224 | weightedLength: 26 225 | valid: true 226 | permillage: 92 227 | displayRangeStart: 0 228 | displayRangeEnd: 16 229 | validRangeStart: 0 230 | validRangeEnd: 16 231 | 232 | - description: "Just url" 233 | text: "http://test.co" 234 | expected: 235 | weightedLength: 23 236 | valid: true 237 | permillage: 82 238 | displayRangeStart: 0 239 | displayRangeEnd: 13 240 | validRangeStart: 0 241 | validRangeEnd: 13 242 | 243 | - description: "Long tweet, overflow at char index 280" 244 | text: "285 chars-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-" 245 | expected: 246 | weightedLength: 285 247 | valid: false 248 | permillage: 1017 249 | displayRangeStart: 0 250 | displayRangeEnd: 284 251 | validRangeStart: 0 252 | validRangeEnd: 279 253 | 254 | - description: "Long tweet with url in the middle, overflow at char index 284" 255 | text: "285 chars- http://www.twitter.com/jack 
xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-" 256 | expected: 257 | weightedLength: 299 258 | valid: false 259 | permillage: 1067 260 | displayRangeStart: 0 261 | displayRangeEnd: 302 262 | validRangeStart: 0 263 | validRangeEnd: 283 264 | 265 | - description: "Long tweet with url at the end, overflow at char index 265" 266 | text: "xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx- http://www.twitter.com/jack " 267 | expected: 268 | weightedLength: 289 269 | valid: false 270 | permillage: 1032 271 | displayRangeStart: 0 272 | displayRangeEnd: 292 273 | validRangeStart: 0 274 | validRangeEnd: 264 275 | 276 | - description: "10 url string, no overflow" 277 | text: "https://www.twitter.com/aloha https://www.twitter.com/aloha https://www.twitter.com/aloha https://www.twitter.com/aloha https://www.twitter.com/aloha https://www.twitter.com/aloha https://www.twitter.com/aloha https://www.twitter.com/aloha https://www.twitter.com/aloha https://www.twitter.com/aloha " 278 | expected: 279 | weightedLength: 240 280 | valid: true 281 | permillage: 857 282 | displayRangeStart: 0 283 | displayRangeEnd: 299 284 | validRangeStart: 0 285 | validRangeEnd: 299 286 | 287 | - description: "160 CJK char, overflow at char index 140" 288 | text: "故人西辞黄鹤楼,烟花三月下扬州。孤帆远影碧空尽,唯见长江天际流。朱雀桥边野草花,乌衣巷口夕阳斜。旧时王谢堂前燕,飞入寻常百姓家。朝辞白帝彩云间,千里江陵一日还。两岸猿声啼不住,轻舟已过万重山。泪湿罗巾梦不成,夜深前殿按歌声。红颜未老恩先断,斜倚薰笼坐到明。独在异乡为异客,每逢佳节倍思亲。遥知兄弟登高处,遍插茱萸少一人。" 289 | expected: 290 | weightedLength: 320 291 | valid: false 292 | permillage: 1142 293 | displayRangeStart: 0 294 | displayRangeEnd: 159 295 | validRangeStart: 0 296 | validRangeEnd: 139 297 | 298 | - description: "160 emoji char, overflow at char index 140" 299 | text: "😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷" 300 | expected: 301 | weightedLength: 320 302 | valid: false 303 | permillage: 1142 304 | displayRangeStart: 0 305 | displayRangeEnd: 319 306 | validRangeStart: 0 307 | validRangeEnd: 279 308 | 309 | - description: "3 latin char + 160 CJK char, overflow at char index 141" 310 | text: "the故人西辞黄鹤楼,烟花三月下扬州。孤帆远影碧空尽,唯见长江天际流。朱雀桥边野草花,乌衣巷口夕阳斜。旧时王谢堂前燕,飞入寻常百姓家。朝辞白帝彩云间,千里江陵一日还。两岸猿声啼不住,轻舟已过万重山。泪湿罗巾梦不成,夜深前殿按歌声。红颜未老恩先断,斜倚薰笼坐到明。独在异乡为异客,每逢佳节倍思亲。遥知兄弟登高处,遍插茱萸少一人。" 311 | expected: 312 | weightedLength: 323 313 | valid: false 314 | permillage: 1153 315 | displayRangeStart: 0 316 | displayRangeEnd: 162 317 | validRangeStart: 0 318 | validRangeEnd: 140 319 | 320 | - description: "'Á' is normalized into 1 char" 321 | text: "ÁB" 322 | expected: 323 | weightedLength: 2 324 | valid: true 325 | permillage: 7 326 | displayRangeStart: 0 327 | displayRangeEnd: 2 328 | validRangeStart: 0 329 | validRangeEnd: 2 330 | 331 | - description: "שּׁ is normalized into 3 chars" 332 | text: "Aשּׁ" 333 | expected: 334 | weightedLength: 4 335 | valid: true 336 | permillage: 14 337 | displayRangeStart: 0 338 | displayRangeEnd: 1 339 | validRangeStart: 0 340 | validRangeEnd: 1 341 | 342 | - description: "282 chars with a normalized character within valid range but outside 280" 343 | text: "282 
chars-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxÁx" 344 | expected: 345 | weightedLength: 281 346 | valid: false 347 | permillage: 1003 348 | displayRangeStart: 0 349 | displayRangeEnd: 281 350 | validRangeStart: 0 351 | validRangeEnd: 280 352 | 353 | - description: "Count a mix of single byte single word, and double word unicode characters" 354 | text: "H🐱☺👨‍👩‍👧‍👦" 355 | expected: 356 | weightedLength: 16 357 | valid: true 358 | permillage: 57 359 | displayRangeStart: 0 360 | displayRangeEnd: 14 361 | validRangeStart: 0 362 | validRangeEnd: 14 363 | 364 | - description: "Count unicode emoji chars inside the basic multilingual plane" 365 | text: "😷👾😡🔥💩" 366 | expected: 367 | weightedLength: 10 368 | valid: true 369 | permillage: 35 370 | displayRangeStart: 0 371 | displayRangeEnd: 9 372 | validRangeStart: 0 373 | validRangeEnd: 9 374 | 375 | - description: "Count unicode emoji chars outside the basic multilingual plane with skin tone modifiers" 376 | text: "🙋🏽👨‍🎤" 377 | expected: 378 | weightedLength: 9 379 | valid: true 380 | permillage: 32 381 | displayRangeStart: 0 382 | displayRangeEnd: 8 383 | validRangeStart: 0 384 | validRangeEnd: 8 385 | 386 | - description: "Handle General Punctuation Characters with visible spaces(u2000-200A)" 387 | text: "This is a tweet with general punctuation characters: \u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u200B\u200C\u200D ‐ ‑ ‒ – — ― ‖ ‗ ‘ ’ ‚ ‛ “ ” „ ‟ ′ ″ ‴ ‵ ‶ ‷" 388 | expected: 389 | weightedLength: 112 390 | valid: true 391 | permillage: 400 392 | displayRangeStart: 0 393 | displayRangeEnd: 111 394 | validRangeStart: 0 395 | validRangeEnd: 111 396 | 397 | - description: "Handle long url with invalid domain labels and short url" 398 | text: "Long url with invalid domain labels and a short url: 
https://somesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlo
ngurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurl.com/foo https://somesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlong
urlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurl.com/foo https://somesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongur
lsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurl.com/foo https://validurl.com" 399 | expected: 400 | weightedLength: 12079 401 | valid: false 402 | permillage: 43139 403 | displayRangeStart: 0 404 | displayRangeEnd: 12075 405 | validRangeStart: 0 406 | validRangeEnd: 279 407 | 408 | - description: "Handle a 64 character domain without protocol" 409 | text: "randomurlrandomurlrandomurlrandomurlrandomurlrandomurlrandomurls.com" 410 | expected: 411 | weightedLength: 68 412 | valid: true 413 | permillage: 242 414 | displayRangeStart: 0 415 | displayRangeEnd: 67 416 | validRangeStart: 0 417 | validRangeEnd: 67 418 | 419 | - description: "Do not allow > 140 CJK characters by virtue of CJK chars greater than 63 punycode encoded chars in the host" 420 | text: "あいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこ http://あいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいう.com" 421 | expected: 422 | weightedLength: 358 423 | valid: false 424 | permillage: 1278 425 | displayRangeStart: 0 426 | displayRangeEnd: 184 427 | validRangeStart: 0 428 | validRangeEnd: 143 429 | 430 | - description: "Allow > 140 CJK characters by virtue of CJK chars less than 63 punycode encoded chars in the host" 431 | text: "あいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこ http://あいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあい.com" 432 | expected: 433 | weightedLength: 264 434 | valid: true 435 | permillage: 942 436 | displayRangeStart: 0 437 | displayRangeEnd: 183 438 | validRangeStart: 0 439 | validRangeEnd: 183 440 | 441 | WeightedTweetsWithDiscountedEmojiCounterTest: 442 | - description: "Regular Tweet with url" 443 | text: "Hi http://test.co" 444 | expected: 445 | weightedLength: 26 446 | valid: true 447 | permillage: 92 448 | displayRangeStart: 0 449 | displayRangeEnd: 16 450 | validRangeStart: 0 451 | validRangeEnd: 16 452 | 453 | - description: "Just url" 454 | text: "http://test.co" 455 | expected: 456 | weightedLength: 23 457 | valid: true 458 | permillage: 82 459 | displayRangeStart: 0 460 | displayRangeEnd: 13 461 | validRangeStart: 0 462 | validRangeEnd: 13 463 | 464 | - description: "Long tweet, overflow at char 
index 280" 465 | text: "285 chars-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-" 466 | expected: 467 | weightedLength: 285 468 | valid: false 469 | permillage: 1017 470 | displayRangeStart: 0 471 | displayRangeEnd: 284 472 | validRangeStart: 0 473 | validRangeEnd: 279 474 | 475 | - description: "Long tweet with url in the middle, overflow at char index 284" 476 | text: "285 chars- http://www.twitter.com/jack xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-" 477 | expected: 478 | weightedLength: 299 479 | valid: false 480 | permillage: 1067 481 | displayRangeStart: 0 482 | displayRangeEnd: 302 483 | validRangeStart: 0 484 | validRangeEnd: 283 485 | 486 | - description: "Long tweet with url at the end, overflow at char index 265" 487 | text: "xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx- http://www.twitter.com/jack " 488 | expected: 489 | weightedLength: 289 490 | valid: false 491 | permillage: 1032 492 | displayRangeStart: 0 493 | displayRangeEnd: 292 494 | validRangeStart: 0 495 | validRangeEnd: 264 496 | 497 | - description: "10 url string, no overflow" 498 | text: "https://www.twitter.com/aloha https://www.twitter.com/aloha https://www.twitter.com/aloha https://www.twitter.com/aloha https://www.twitter.com/aloha https://www.twitter.com/aloha https://www.twitter.com/aloha https://www.twitter.com/aloha https://www.twitter.com/aloha https://www.twitter.com/aloha " 499 | expected: 500 | weightedLength: 240 501 | valid: true 502 | permillage: 857 503 | displayRangeStart: 0 504 | displayRangeEnd: 299 505 | validRangeStart: 0 506 | validRangeEnd: 299 507 | 508 | - description: "160 CJK char, overflow at char index 140" 509 | text: "故人西辞黄鹤楼,烟花三月下扬州。孤帆远影碧空尽,唯见长江天际流。朱雀桥边野草花,乌衣巷口夕阳斜。旧时王谢堂前燕,飞入寻常百姓家。朝辞白帝彩云间,千里江陵一日还。两岸猿声啼不住,轻舟已过万重山。泪湿罗巾梦不成,夜深前殿按歌声。红颜未老恩先断,斜倚薰笼坐到明。独在异乡为异客,每逢佳节倍思亲。遥知兄弟登高处,遍插茱萸少一人。" 510 | expected: 511 | weightedLength: 320 512 | valid: false 513 | permillage: 1142 514 | displayRangeStart: 0 515 | displayRangeEnd: 159 516 | validRangeStart: 0 517 | validRangeEnd: 139 518 | 519 | - description: "160 emoji char, overflow at char index 140" 520 | text: "😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷😷" 521 | expected: 522 | weightedLength: 320 523 | valid: false 524 | permillage: 1142 525 | displayRangeStart: 0 526 | displayRangeEnd: 319 527 | validRangeStart: 0 528 | validRangeEnd: 279 529 | 530 | - description: "3 latin char + 160 CJK char, overflow at char index 141" 531 | text: "the故人西辞黄鹤楼,烟花三月下扬州。孤帆远影碧空尽,唯见长江天际流。朱雀桥边野草花,乌衣巷口夕阳斜。旧时王谢堂前燕,飞入寻常百姓家。朝辞白帝彩云间,千里江陵一日还。两岸猿声啼不住,轻舟已过万重山。泪湿罗巾梦不成,夜深前殿按歌声。红颜未老恩先断,斜倚薰笼坐到明。独在异乡为异客,每逢佳节倍思亲。遥知兄弟登高处,遍插茱萸少一人。" 532 | expected: 533 | weightedLength: 323 534 | valid: false 535 | permillage: 1153 536 | displayRangeStart: 0 537 | displayRangeEnd: 162 538 | validRangeStart: 0 539 | validRangeEnd: 
140 540 | 541 | - description: "282 chars with a normalized character within valid range but outside 280" 542 | text: "282 chars-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxxxxxx-xxxxxÁx" 543 | expected: 544 | weightedLength: 281 545 | valid: false 546 | permillage: 1003 547 | displayRangeStart: 0 548 | displayRangeEnd: 281 549 | validRangeStart: 0 550 | validRangeEnd: 280 551 | 552 | - description: "Count a mix of single byte single word, and double word unicode characters" 553 | text: "H🐱☺👨‍👩‍👧‍👦" 554 | expected: 555 | weightedLength: 7 556 | valid: true 557 | permillage: 25 558 | displayRangeStart: 0 559 | displayRangeEnd: 14 560 | validRangeStart: 0 561 | validRangeEnd: 14 562 | 563 | - description: "Count unicode emoji chars inside the basic multilingual plane" 564 | text: "😷👾😡🔥💩" 565 | expected: 566 | weightedLength: 10 567 | valid: true 568 | permillage: 35 569 | displayRangeStart: 0 570 | displayRangeEnd: 9 571 | validRangeStart: 0 572 | validRangeEnd: 9 573 | 574 | - description: "Count unicode emoji chars outside the basic multilingual plane with skin tone modifiers" 575 | text: "🙋🏽👨‍🎤" 576 | expected: 577 | weightedLength: 4 578 | valid: true 579 | permillage: 14 580 | displayRangeStart: 0 581 | displayRangeEnd: 8 582 | validRangeStart: 0 583 | validRangeEnd: 8 584 | 585 | - description: "Handle General Punctuation Characters with visible spaces(u2000-200A), no ZWJ/ZWNJ" 586 | text: "This is a tweet with general punctuation characters: \u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u200B ‐ ‑ ‒ – — ― ‖ ‗ ‘ ’ ‚ ‛ “ ” „ ‟ ′ ″ ‴ ‵ ‶ ‷" 587 | expected: 588 | weightedLength: 110 589 | valid: true 590 | permillage: 392 591 | displayRangeStart: 0 592 | displayRangeEnd: 109 593 | validRangeStart: 0 594 | validRangeEnd: 109 595 | 596 | - description: "Handle long url with invalid domain labels and short url" 597 | text: "Long url with invalid domain labels and a short url: 
https://somesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlo
ngurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurl.com/foo https://somesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlong
urlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurl.com/foo https://somesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongur
lsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurlsomesuperlongurl.com/foo https://validurl.com" 598 | expected: 599 | weightedLength: 12079 600 | valid: false 601 | permillage: 43139 602 | displayRangeStart: 0 603 | displayRangeEnd: 12075 604 | validRangeStart: 0 605 | validRangeEnd: 279 606 | 607 | - description: "Handle a 64 character domain without protocol" 608 | text: "randomurlrandomurlrandomurlrandomurlrandomurlrandomurlrandomurls.com" 609 | expected: 610 | weightedLength: 68 611 | valid: true 612 | permillage: 242 613 | displayRangeStart: 0 614 | displayRangeEnd: 67 615 | validRangeStart: 0 616 | validRangeEnd: 67 617 | 618 | - description: "Do not allow > 140 CJK characters by virtue of CJK chars greater than 63 punycode encoded chars in the host" 619 | text: "あいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこ http://あいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいう.com" 620 | expected: 621 | weightedLength: 358 622 | valid: false 623 | permillage: 1278 624 | displayRangeStart: 0 625 | displayRangeEnd: 184 626 | validRangeStart: 0 627 | validRangeEnd: 143 628 | 629 | - description: "Allow > 140 CJK characters by virtue of CJK chars less than 63 punycode encoded chars in the host" 630 | text: "あいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこ http://あいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあいうえおかきくけこあい.com" 631 | expected: 632 | weightedLength: 264 633 | valid: true 634 | permillage: 942 635 | displayRangeStart: 0 636 | displayRangeEnd: 183 637 | validRangeStart: 0 638 | validRangeEnd: 183 639 | 640 | - description: "140 family emoji" 641 | text: 
"👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦👨‍👩‍👧‍👦" 642 | expected: 643 | weightedLength: 280 644 | valid: true 645 | permillage: 1000 646 | displayRangeStart: 0 647 | displayRangeEnd: 1539 648 | validRangeStart: 0 649 | validRangeEnd: 1539 650 | 651 | - description: "Emoji with a leading character in the latin range is counted as 2" 652 | text: "1⃣" 653 | expected: 654 | weightedLength: 2 655 | valid: true 656 | permillage: 7 657 | displayRangeStart: 0 658 | displayRangeEnd: 1 659 | validRangeStart: 0 660 | validRangeEnd: 1 661 | 662 | - description: "Unicode 10.0 emoji" 663 | text: "Unicode 10.0 emoji: 🤪; 🧕; 🧕🏾; 🏴󠁧󠁢󠁥󠁮󠁧󠁿" 664 | expected: 665 | weightedLength: 34 666 | valid: true 667 | permillage: 121 668 | displayRangeStart: 0 669 | displayRangeEnd: 47 670 | validRangeStart: 0 671 | validRangeEnd: 47 672 | 673 | - description: "Unicode 9.0 emoji" 674 | text: "Unicode 9.0 emoji: 🤠; 💃; 💃🏾" 675 | expected: 676 | weightedLength: 29 677 | valid: true 678 | permillage: 103 679 | displayRangeStart: 0 680 | displayRangeEnd: 30 681 | validRangeStart: 0 682 | validRangeEnd: 30 683 | 684 | UnicodeDirectionalMarkerCounterTest: 685 | - description: "Handle invalid characters" 686 | text: "ABC\u202A\uFFFFABC\uFFFE" 687 | expected: 688 | weightedLength: 12 689 | valid: false 690 | permillage: 42 691 | displayRangeStart: 0 692 | displayRangeEnd: 8 693 | validRangeStart: 0 694 | validRangeEnd: 3 695 | 696 | - description: "Tweet text containing directional characters should be considered valid" 697 | text: "\U00002066\U0000202Ahttp://foobar.پاکستان/\U0000202C\U00002069" 698 | expected: 699 | weightedLength: 31 700 | valid: true 701 | permillage: 110 702 | displayRangeStart: 0 703 | displayRangeEnd: 25 704 | validRangeStart: 0 705 | validRangeEnd: 25 706 | --------------------------------------------------------------------------------