├── .gitignore
├── .gitmodules
├── .travis.yml
├── LICENSE
├── MANIFEST.in
├── README.md
├── __init__.py
├── requirements.txt
├── setup.py
├── tests.py
└── twitter_text
    ├── __init__.py
    ├── autolink.py
    ├── extractor.py
    ├── highlighter.py
    ├── regex.py
    ├── templatetags
    │   ├── __init__.py
    │   └── twitterize.py
    ├── unicode.py
    └── validation.py

/.gitignore:
--------------------------------------------------------------------------------
*.pyc
build*
*.egg*
dist

--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
[submodule "twitter-text-conformance"]
	path = twitter-text-conformance
	url = https://github.com/dryan/twitter-text-conformance.git

--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
language: python
python:
  - "2.6"
  - "2.7"

install:
  - "pip install . --use-mirrors"
  - "pip install -r requirements.txt --use-mirrors"
script: "python ./tests.py"
notifications:
  email: false

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright (c) 2010, Daniel Ryan
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice,
  this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
  notice, this list of conditions and the following disclaimer in the
  documentation and/or other materials provided with the distribution.
* Neither the name of "dryan" nor Daniel Ryan nor the names of its contributors may be used
  to endorse or promote products derived from this software without
  specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

==================

This project is modeled after http://github.com/mzsanford/twitter-text-rb under the Apache License, 2.0.

Copyright 2010 Twitter, Inc.

Licensed under the Apache License, Version 2.0 (the "License"); you may not
use this file except in compliance with the License. You may obtain a copy of
the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
License for the specific language governing permissions and limitations under
the License.

--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include LICENSE
include *.textile
recursive-include twitter_text *.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
A port of the Ruby gem [twitter-text-rb](https://github.com/twitter/twitter-text-rb) to Python.

[![Build Status](https://travis-ci.org/dryan/twitter-text-py.png?branch=master)](https://travis-ci.org/dryan/twitter-text-py)

# Changes in 2.0

See the [pull request](https://github.com/dryan/twitter-text-py/pull/17) for details.

# Usage

You can either create a new TwitterText object with the text of the tweet you want to process, e.g. `TwitterText('twitter-text-py is #awesome')`, or use any of the submodule objects directly (Autolink, Extractor, HitHighlighter or Validation), passing in the tweet text as an argument.

The library also contains a Django template filter that applies the auto_link method to the passed-in text. It can also optionally apply the hit_highlight method. Example:

    {% load twitterize %}

    {{ obj.body|twitter_text }}
    {{ obj.body|twitter_text:"my term" }}

You can test that the library is working correctly by running `python tests.py` in the root of the repository (the `twitter-text-conformance` submodule needs to be checked out first).

## TwitterText(text)

### Properties:

* text: the original text you passed in, or the modified version if you've called any functions on the object.
* original_text: the original text you passed in; never modified. Useful as a fallback or for comparisons.
* has_been_linked: boolean denoting whether any of the Autolink functions have been called. (Mostly for internal use.)
* tweet_length: the value returned by `validation.tweet_length` or None if that function has not yet been called.
* tweet_is_valid: boolean set by `validation.tweet_invalid` or None if that function has not yet been called.
* validation_error: the validation error string returned by `validation.tweet_invalid` or None if that function has not yet been called.
* autolink: property pointing to an Autolink object initialized with `text`
* extractor: property pointing to an Extractor object initialized with `text`
* highlighter: property pointing to a HitHighlighter object initialized with `text`
* validation: property pointing to a Validation object initialized with `text`

## Autolink(text)

This object modifies the text passed to it (and the parent TwitterText.text if present).

### Defaults

These may be overridden by the options dict passed to a particular method.

* url_class = 'tweet-url'
* list_class = 'list-slug'
* username_class = 'username'
* hashtag_class = 'hashtag'
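For example, a minimal usage sketch (the tweet text and the exact output markup here are illustrative; the class names come from the defaults above):

    from twitter_text import Autolink

    link = Autolink(u'Hello @world, check out #python')
    html = link.auto_link({'suppress_no_follow': True})
    # html now wraps the entities in <a> tags, e.g.
    # @<a class="tweet-url username" href="https://twitter.com/world">world</a>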
### Methods:

__auto_link(self, options = {})__

Add `<a></a>` tags around the usernames, lists, hashtags and URLs in the provided text. The `<a>` tags can be controlled with the following options:

* url_class: class to add to all `<a>` tags
* list_class: class to add to list `<a>` tags
* username_class: class to add to username `<a>` tags
* hashtag_class: class to add to hashtag `<a>` tags
* username_url_base: the value for the href attribute on username links. The @username (minus the @) will be appended at the end of this.
* list_url_base: the value for the href attribute on list links. The @username/list (minus the @) will be appended at the end of this.
* hashtag_url_base: the value for the href attribute on hashtag links. The #hashtag (minus the #) will be appended at the end of this.
* suppress_lists: disable auto-linking to lists
* suppress_no_follow: do not add rel="nofollow" to auto-linked items
* html_attrs: a dictionary of HTML attributes to add to non-Twitter links

__auto_link_usernames_or_lists(self, options = {})__

Add `<a></a>` tags around the usernames and lists in the provided text. The `<a>` tags can be controlled with the following options:

* url_class: class to add to all `<a>` tags
* list_class: class to add to list `<a>` tags
* username_class: class to add to username `<a>` tags
* username_url_base: the value for the href attribute on username links. The @username (minus the @) will be appended at the end of this.
* list_url_base: the value for the href attribute on list links. The @username/list (minus the @) will be appended at the end of this.
* suppress_lists: disable auto-linking to lists
* suppress_no_follow: do not add rel="nofollow" to auto-linked items

__auto_link_hashtags(self, options = {})__

Add `<a></a>` tags around the hashtags in the provided text. The `<a>` tags can be controlled with the following options:

* url_class: class to add to all `<a>` tags
* hashtag_class: class to add to hashtag `<a>` tags
* hashtag_url_base: the value for the href attribute. The hashtag text (minus the #) will be appended at the end of this.
* suppress_no_follow: do not add rel="nofollow" to auto-linked items

__auto_link_urls(self, options = {})__

Add `<a></a>` tags around the URLs in the provided text. Any elements in the options dict (except `suppress_no_follow`) will be converted to HTML attributes and placed in the `<a>` tag. Unless the options contain `suppress_no_follow`, the rel="nofollow" attribute will be added.

## Extractor

This object does not modify the text passed to it (or the parent TwitterText.text if present).

### Methods

__extract_mentioned_screen_names__

Extracts a list of all usernames mentioned in the Tweet text. If the text contains no username mentions an empty list will be returned.

If a transform is given, then it will be called with each username.

__extract_mentioned_screen_names_with_indices__

Extracts a list of all usernames mentioned in the Tweet text along with the indices for where the mention occurred, in the format:

    {
        'screen_name': username_string,
        'indices': ( start_position, end_position )
    }

If the text contains no username mentions, an empty list will be returned.

If a transform is given, then it will be called with each username, the start index, and the end index in the text.
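A short sketch of this return format in practice (the output shown is illustrative; dict key order may vary):

    from twitter_text import Extractor

    extractor = Extractor(u'@dryan ported twitter-text to Python')
    print extractor.extract_mentioned_screen_names_with_indices()
    # [{'screen_name': u'dryan', 'indices': [0, 6]}]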
__extract_reply_screen_name__

Extracts the first username replied to in the Tweet text. If the text does not contain a reply, None will be returned.

If a transform is given then it will be called with the username replied to (if any).

__extract_urls__

Extracts a list of all URLs included in the Tweet text. If the text contains no URLs an empty list will be returned.

If a transform is given then it will be called for each URL.

__extract_urls_with_indices__

Extracts a list of all URLs included in the Tweet text along with the indices, in the format:

    {
        'url': url_string,
        'indices': ( start_position, end_position )
    }

If the text contains no URLs an empty list will be returned.

If a transform is given then it will be called for each URL, the start index, and the end index in the text.

__extract_hashtags__

Extracts a list of all hashtags included in the Tweet text. If the text contains no hashtags an empty list will be returned. The list returned will not include the leading # character.

If a transform is given then it will be called for each hashtag.

__extract_hashtags_with_indices__

Extracts a list of all hashtags included in the Tweet text along with the indices, in the format:

    {
        'hashtag': hashtag_text,
        'indices': ( start_position, end_position )
    }

If the text contains no hashtags an empty list will be returned. The list returned will not include the leading # character.

If a transform is given then it will be called for each hashtag.

## HitHighlighter

### Defaults

These may be overridden by kwargs on a particular method.

* highlight_tag = 'em'
* highlight_class = 'search-hit'

### Methods

__hit_highlight(self, hits = [], **kwargs)__

Add `<em></em>` tags around the hits provided in the text, except for occurrences inside hashtags. Hits can be passed in as a list of (start, end) index pairs; if none are given, the text is searched for the query kwarg instead.

The highlight tag can be overridden using the tag kwarg. For example:

    HitHighlighter('test hit here').hit_highlight(query='hit', tag='strong')
    # => u'test <strong>hit</strong> here'

## Validation

### Methods

__tweet_length__

Returns the length of the string as it would be displayed. This is equivalent to the length of the Unicode NFC (see: http://www.unicode.org/reports/tr15). This is needed in order to consistently calculate the length of a string no matter which actual form was transmitted. For example:

    U+0065 Latin Small Letter E
    + U+0301 Combining Acute Accent
    ----------
    = 2 code points, displayed as é (1 visual glyph)

The NFC of {U+0065, U+0301} is {U+00E9}, which is a single character and a display length of 1.

The string could also contain U+00E9 already, in which case the canonicalization will not change the value.

__tweet_invalid__

Check the text for any reason that it may not be valid as a Tweet. This is meant as a pre-validation before posting to api.twitter.com. There are several server-side reasons for Tweets to fail, but this pre-validation will allow quicker feedback.

Returns False if this text is valid. Otherwise one of the following error strings will be returned:

* "Too long": if the text is too long
* "Empty text": if the text is empty
* "Invalid characters": if the text contains non-Unicode or any of the disallowed Unicode characters
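A brief sketch of the validation calls described above (method names follow this README; the return values shown are illustrative):

    from twitter_text import Validation

    validation = Validation(u'just setting up my twttr')
    print validation.tweet_length()        # 24
    print validation.tweet_invalid()       # False, i.e. the text is valid
    print Validation(u'').tweet_invalid()  # "Empty text"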
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dryan/twitter-text-py/143ee74751597efd782417df4773d586720428a4/__init__.py

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
argparse==1.2.1
PyYAML==3.10
beautifulsoup4==4.2.0

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup, find_packages

setup(
    name='twitter-text-py',
    version='2.0.2',
    description='A library for auto-converting URLs, mentions, hashtags, lists, etc. in Twitter text. Also does tweet validation and search term highlighting.',
    author='Daniel Ryan',
    author_email='dryan@dryan.com',
    url='http://github.com/dryan/twitter-text-py',
    packages=find_packages(),
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Environment :: Web Environment',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: BSD License',
        'Operating System :: OS Independent',
        'Programming Language :: Python',
        'Framework :: Django',
    ],
    include_package_data=True,
    install_requires=['setuptools'],
    license = "BSD"
)

--------------------------------------------------------------------------------
/tests.py:
--------------------------------------------------------------------------------
# encoding=utf-8

import twitter_text, sys, os, json, argparse, re
from twitter_text.unicode import force_unicode

# detect narrow (UCS-2) Python builds, which cannot represent characters
# outside the Basic Multilingual Plane with unichr
narrow_build = True
try:
    unichr(0x20000)
    narrow_build = False
except ValueError:
    pass

parser = argparse.ArgumentParser(description = u'Run the integration tests for twitter_text')
parser.add_argument('--ignore-narrow-errors', '-i', help = u'Ignore errors caused by narrow builds', default = False, action = 'store_true')
args = parser.parse_args()

try:
    import yaml
except ImportError:
    raise Exception('You need to install PyYAML to run the tests')
# from http://stackoverflow.com/questions/2890146/how-to-force-pyyaml-to-load-strings-as-unicode-objects
from yaml import Loader, SafeLoader
def construct_yaml_str(self, node):
    return self.construct_scalar(node)
Loader.add_constructor(u'tag:yaml.org,2002:str', construct_yaml_str)
SafeLoader.add_constructor(u'tag:yaml.org,2002:str', construct_yaml_str)

try:
    from bs4 import BeautifulSoup
except ImportError:
    try:
        from BeautifulSoup import BeautifulSoup
    except ImportError:
        raise Exception('You need to install BeautifulSoup to run the tests')

def success(text):
    return (u'\033[92m%s\033[0m\n' % text).encode('utf-8')

def error(text):
    return (u'\033[91m%s\033[0m\n' % text).encode('utf-8')
attempted = 0

def assert_equal_without_attribute_order(result, test, failure_message = None):
    global attempted
    attempted += 1
    # Beautiful Soup sorts the attributes for us so we can skip all the hoops the ruby version jumps through
    assert BeautifulSoup(result) == BeautifulSoup(test.get('expected')), error(u'Test %d Failed: %s' % (attempted, test.get('description')))
    sys.stdout.write(success(u'Test %d Passed: %s' % (attempted, test.get('description'))))
    sys.stdout.flush()

def assert_equal(result, test):
    global attempted
    attempted += 1
    assert result == test.get('expected'), error(u'\nTest %d Failed: %s%s' % (attempted, test.get('description'), u'\n%s' % test.get('hits') if test.get('hits') else ''))
    sys.stdout.write(success(u'Test %d Passed: %s' % (attempted, test.get('description'))))
    sys.stdout.flush()

# extractor section
extractor_file = open(os.path.join('twitter-text-conformance', 'extract.yml'), 'r')
extractor_tests = yaml.load(force_unicode(extractor_file.read()))
extractor_file.close()

sys.stdout.write('Testing Extractor\n')
sys.stdout.flush()

for section in extractor_tests.get('tests'):
    sys.stdout.write('\nTesting Extractor: %s\n' % section)
    sys.stdout.flush()
    for test in extractor_tests.get('tests').get(section):
        if (args.ignore_narrow_errors or narrow_build) and section in ['hashtags'] and test.get('description') in ['Hashtag with ideographic iteration mark']:
            sys.stdout.write('Skipping: %s\n' % test.get('description'))
            sys.stdout.flush()
            continue
        extractor = twitter_text.extractor.Extractor(test.get('text'))
        if section == 'mentions':
            assert_equal(extractor.extract_mentioned_screen_names(), test)
        elif section == 'mentions_with_indices':
            assert_equal(extractor.extract_mentioned_screen_names_with_indices(), test)
        elif section == 'mentions_or_lists_with_indices':
            assert_equal(extractor.extract_mentions_or_lists_with_indices(), test)
        elif section == 'replies':
            assert_equal(extractor.extract_reply_screen_name(), test)
        elif section == 'urls':
            assert_equal(extractor.extract_urls(), test)
        elif section == 'urls_with_indices':
            assert_equal(extractor.extract_urls_with_indices(), test)
        elif section == 'hashtags':
            assert_equal(extractor.extract_hashtags(), test)
        elif section == 'cashtags':
            assert_equal(extractor.extract_cashtags(), test)
        elif section == 'hashtags_with_indices':
            assert_equal(extractor.extract_hashtags_with_indices(), test)
        elif section == 'cashtags_with_indices':
            assert_equal(extractor.extract_cashtags_with_indices(), test)

# autolink section
autolink_file = open(os.path.join('twitter-text-conformance', 'autolink.yml'), 'r')
autolink_tests = yaml.load(force_unicode(autolink_file.read()))
autolink_file.close()

sys.stdout.write('\nTesting Autolink\n')
sys.stdout.flush()

autolink_options = {'suppress_no_follow': True}

for section in autolink_tests.get('tests'):
    sys.stdout.write('\nTesting Autolink: %s\n' % section)
    for test in autolink_tests.get('tests').get(section):
        if (args.ignore_narrow_errors or narrow_build) and section in ['hashtags'] and test.get('description') in ['Autolink a hashtag containing ideographic iteration mark']:
            sys.stdout.write('Skipping: %s\n' % test.get('description'))
            sys.stdout.flush()
            continue
        autolink = twitter_text.autolink.Autolink(test.get('text'))
        if section == 'usernames':
            assert_equal_without_attribute_order(autolink.auto_link_usernames_or_lists(autolink_options), test)
        elif section == 'cashtags':
            assert_equal_without_attribute_order(autolink.auto_link_cashtags(autolink_options), test)
        elif section == 'urls':
            assert_equal_without_attribute_order(autolink.auto_link_urls(autolink_options), test)
        elif section == 'hashtags':
            assert_equal_without_attribute_order(autolink.auto_link_hashtags(autolink_options), test)
        elif section == 'all':
            assert_equal_without_attribute_order(autolink.auto_link(autolink_options), test)
        elif section == 'lists':
            assert_equal_without_attribute_order(autolink.auto_link_usernames_or_lists(autolink_options), test)
        elif section == 'json':
            assert_equal_without_attribute_order(autolink.auto_link_with_json(json.loads(test.get('json')), autolink_options), test)

# hit_highlighting section
hit_highlighting_file = open(os.path.join('twitter-text-conformance', 'hit_highlighting.yml'), 'r')
hit_highlighting_tests = yaml.load(force_unicode(hit_highlighting_file.read()))
hit_highlighting_file.close()

sys.stdout.write('\nTesting Hit Highlighting\n')
sys.stdout.flush()

for section in hit_highlighting_tests.get('tests'):
    sys.stdout.write('\nTesting Hit Highlighting: %s\n' % section)
    for test in hit_highlighting_tests.get('tests').get(section):
        hit_highlighter = twitter_text.highlighter.HitHighlighter(test.get('text'))
        if section == 'plain_text':
            assert_equal(hit_highlighter.hit_highlight(hits = test.get('hits')), test)
        elif section == 'with_links':
            assert_equal_without_attribute_order(hit_highlighter.hit_highlight(hits = test.get('hits')), test)

# validation section
validation_tested = False
validate_tests = None
try:
    validate_file = open(os.path.join('twitter-text-conformance', 'validate.yml'), 'r')
    validate_file_contents = validate_file.read()
    validate_tests = yaml.load(re.sub(ur'\\n', '\n', validate_file_contents.encode('unicode-escape')))
    validate_file.close()
except ValueError:
    sys.stdout.write('\nValidation tests were skipped because of wide character issues\n')
    sys.stdout.flush()

if validate_tests:
    sys.stdout.write('\nTesting Validation\n')
    sys.stdout.flush()

    for section in validate_tests.get('tests'):
        sys.stdout.write('\nTesting Validation: %s\n' % section)
        for test in validate_tests.get('tests').get(section):
            validator = twitter_text.validation.Validation(test.get('text'))
            if section == 'tweets':
                assert_equal(not validator.tweet_invalid(), test)
            elif section == 'usernames':
                assert_equal(validator.valid_username(), test)
            elif section == 'lists':
                assert_equal(validator.valid_list(), test)
            elif section == 'hashtags':
                assert_equal(validator.valid_hashtag(), test)
            elif section == 'urls':
                assert_equal(validator.valid_url(), test)

sys.stdout.write(u'\033[0m-------\n\033[92m%d tests passed.\033[0m\n' % attempted)
sys.stdout.flush()
sys.exit(os.EX_OK)

--------------------------------------------------------------------------------
/twitter_text/__init__.py:
--------------------------------------------------------------------------------
# encoding=utf-8

from twitter_text.autolink import Autolink
from twitter_text.extractor import Extractor
from twitter_text.highlighter import HitHighlighter
from twitter_text.validation import Validation
from twitter_text.unicode import force_unicode

class TwitterText(object):
    def __init__(self, text):
        self.text = force_unicode(text) # this will get modified by some functions
        self.original_text = self.text # this never changes; use it as a fallback or for comparison
        self.has_been_linked = False
        self.tweet_length = None # gets changed by validation method
        self.tweet_is_valid = None # gets changed by validation method
        self.validation_error = None # gets changed by validation method

    def __unicode__(self):
        return self.text

    def __repr__(self):
        return self.__unicode__()

    @property
    def autolink(self):
        return Autolink(self.text, parent = self)

    @property
    def extractor(self):
        return Extractor(self.text)

    @property
    def highlighter(self):
        return HitHighlighter(self.text, parent = self)

    @property
    def validation(self):
        return Validation(self.text, parent = self)

--------------------------------------------------------------------------------
/twitter_text/autolink.py:
--------------------------------------------------------------------------------
# encoding=utf-8

import re, cgi

from twitter_text.regex import REGEXEN
from twitter_text.unicode import force_unicode
from twitter_text.extractor import Extractor

# Default CSS class for auto-linked lists
DEFAULT_LIST_CLASS = "tweet-url list-slug"
# Default CSS class for auto-linked usernames
DEFAULT_USERNAME_CLASS = "tweet-url username"
# Default CSS class for auto-linked hashtags
DEFAULT_HASHTAG_CLASS = "tweet-url hashtag"
# Default CSS class for auto-linked cashtags
DEFAULT_CASHTAG_CLASS = "tweet-url cashtag"

# Default URL base for auto-linked usernames
DEFAULT_USERNAME_URL_BASE = "https://twitter.com/"
# Default URL base for auto-linked lists
DEFAULT_LIST_URL_BASE = "https://twitter.com/"
# Default URL base for auto-linked hashtags
DEFAULT_HASHTAG_URL_BASE = "https://twitter.com/#!/search?q=%23"
# Default URL base for auto-linked cashtags
DEFAULT_CASHTAG_URL_BASE = "https://twitter.com/#!/search?q=%24"

# Default attributes for invisible span tag
DEFAULT_INVISIBLE_TAG_ATTRS = "style='position:absolute;left:-9999px;'"

DEFAULT_OPTIONS = {
    'list_class': DEFAULT_LIST_CLASS,
    'username_class': DEFAULT_USERNAME_CLASS,
    'hashtag_class': DEFAULT_HASHTAG_CLASS,
    'cashtag_class': DEFAULT_CASHTAG_CLASS,

    'username_url_base': DEFAULT_USERNAME_URL_BASE,
    'list_url_base': DEFAULT_LIST_URL_BASE,
    'hashtag_url_base': DEFAULT_HASHTAG_URL_BASE,
    'cashtag_url_base': DEFAULT_CASHTAG_URL_BASE,

    'invisible_tag_attrs': DEFAULT_INVISIBLE_TAG_ATTRS,
}

OPTIONS_NOT_ATTRIBUTES = (
    'url_class',
    'list_class',
    'username_class',
    'hashtag_class',
    'cashtag_class',
    'username_url_base',
    'list_url_base',
    'hashtag_url_base',
    'cashtag_url_base',
    'username_url_transform',
    'list_url_transform',
    'hashtag_url_transform',
    'cashtag_url_transform',
    'link_url_transform',
    'username_include_symbol',
    'suppress_lists',
    'suppress_no_follow',
    'url_entities',
    'invisible_tag_attrs',
    'symbol_tag',
    'text_with_symbol_tag',
    'url_target',
    'link_attribute_transform',
    'link_text_transform',
)

HTML_ENTITIES = {
    '&': '&amp;',
    '>': '&gt;',
    '<': '&lt;',
    '"': '&quot;',
    "'": '&#39;',
}

BOOLEAN_ATTRIBUTES = (
    'disabled',
    'readonly',
    'multiple',
    'checked',
)

def default_transform(entity, text):
    return text

class Autolink(object):
    def __init__(self, text, **kwargs):
        self.text = force_unicode(text)
        self.parent = kwargs.get('parent', False)
        self.extractor = Extractor(self.text)

    def auto_link_with_json(self, json_obj, options = {}):
        # concatenate entities
        entities = []
        if 'entities' in json_obj:
            json_obj = json_obj.get('entities')
        for key in json_obj:
            if type(json_obj[key]) == list:
                entities = entities + json_obj[key]

        # map JSON entity to twitter_text entity
        for entity in entities:
            if 'text' in entity:
                entity['hashtag'] = entity.get('text')

        return self.auto_link_entities(entities, options)

    def auto_link_entities(self, entities = [], options = {}):
        if not self.text:
            return self.text

        # NOTE deprecate these attributes not options keys in options hash, then use html_attrs
        options = dict(DEFAULT_OPTIONS.items() + options.items())
        options['html_attrs'] = self._extract_html_attrs_from_options(options)
        if not options.get('suppress_no_follow', False):
            options['html_attrs']['rel'] = "nofollow"

        entities.sort(key = lambda entity: entity['indices'][0], reverse = True)
        chars = self.text

        for entity in entities:
            if 'url' in entity:
                chars = self._link_to_url(entity, chars, options)
            elif 'hashtag' in entity:
                chars = self._link_to_hashtag(entity, chars, options)
            elif 'screen_name' in entity:
                chars = self._link_to_screen_name(entity, chars, options)
            elif 'cashtag' in entity:
                chars = self._link_to_cashtag(entity, chars, options)

        return chars
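    # Illustrative note (these example values are not from the original source):
    # auto_link_entities expects entity dicts shaped like the Extractor output,
    # e.g.
    #   {'screen_name': u'dryan', 'indices': [0, 6]}
    #   {'hashtag': u'python', 'indices': [10, 17]}
    #   {'url': u'http://example.com', 'indices': [20, 38]}
    # Entities are sorted and substituted from right to left so that the
    # indices of entities earlier in the text stay valid while linking.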
    def auto_link(self, options = {}):
        """
        Add <a></a> tags around the usernames, lists, hashtags and URLs in the provided text.
        The <a> tags can be controlled with the following entries in the options hash.
        Also any elements in the options hash will be converted to HTML attributes
        and placed in the <a> tag.

        @url_class                  class to add to url <a> tags
        @list_class                 class to add to list <a> tags
        @username_class             class to add to username <a> tags
        @hashtag_class              class to add to hashtag <a> tags
        @cashtag_class              class to add to cashtag <a> tags
        @username_url_base          the value for href attribute on username links. The @username (minus the @) will be appended at the end of this.
        @list_url_base              the value for href attribute on list links. The @username/list (minus the @) will be appended at the end of this.
        @hashtag_url_base           the value for href attribute on hashtag links. The #hashtag (minus the #) will be appended at the end of this.
        @cashtag_url_base           the value for href attribute on cashtag links. The $cashtag (minus the $) will be appended at the end of this.
        @invisible_tag_attrs        HTML attribute to add to invisible span tags
        @username_include_symbol    place the @ symbol within username and list links
        @suppress_lists             disable auto-linking to lists
        @suppress_no_follow         do not add rel="nofollow" to auto-linked items
        @symbol_tag                 tag to apply around symbol (@, #, $) in username / hashtag / cashtag links
        @text_with_symbol_tag       tag to apply around text part in username / hashtag / cashtag links
        @url_target                 the value for target attribute on URL links.
        @link_attribute_transform   function to modify the attributes of a link based on the entity. called with (entity, attributes) params, and should modify the attributes hash.
        @link_text_transform        function to modify the text of a link based on the entity. called with (entity, text) params, and should return a modified text.
        """
        return self.auto_link_entities(self.extractor.extract_entities_with_indices({'extract_url_without_protocol': False}), options)

    def auto_link_usernames_or_lists(self, options = {}):
        """
        Add <a></a> tags around the usernames and lists in the provided text. The
        <a> tags can be controlled with the following entries in the options hash.
        Also any elements in the options hash will be converted to HTML attributes
        and placed in the <a> tag.

        @list_class                 class to add to list <a> tags
        @username_class             class to add to username <a> tags
        @username_url_base          the value for href attribute on username links. The @username (minus the @) will be appended at the end of this.
        @list_url_base              the value for href attribute on list links. The @username/list (minus the @) will be appended at the end of this.
        @username_include_symbol    place the @ symbol within username and list links
        @suppress_lists             disable auto-linking to lists
        @suppress_no_follow         do not add rel="nofollow" to auto-linked items
        @symbol_tag                 tag to apply around symbol (@, #, $) in username / hashtag / cashtag links
        @text_with_symbol_tag       tag to apply around text part in username / hashtag / cashtag links
        @link_attribute_transform   function to modify the attributes of a link based on the entity. called with (entity, attributes) params, and should modify the attributes hash.
        @link_text_transform        function to modify the text of a link based on the entity. called with (entity, text) params, and should return a modified text.
        """
        return self.auto_link_entities(self.extractor.extract_mentions_or_lists_with_indices(), options)

    def auto_link_hashtags(self, options = {}):
        """
        Add <a></a> tags around the hashtags in the provided text.
        The <a> tags can be controlled with the following entries in the options hash.
        Also any elements in the options hash will be converted to HTML attributes
        and placed in the <a> tag.

        @hashtag_class              class to add to hashtag <a> tags
        @hashtag_url_base           the value for href attribute. The hashtag text (minus the #) will be appended at the end of this.
        @suppress_no_follow         do not add rel="nofollow" to auto-linked items
        @symbol_tag                 tag to apply around symbol (@, #, $) in username / hashtag / cashtag links
        @text_with_symbol_tag       tag to apply around text part in username / hashtag / cashtag links
        @link_attribute_transform   function to modify the attributes of a link based on the entity. called with (entity, attributes) params, and should modify the attributes hash.
        @link_text_transform        function to modify the text of a link based on the entity. called with (entity, text) params, and should return a modified text.
        """
        return self.auto_link_entities(self.extractor.extract_hashtags_with_indices(), options)

    def auto_link_cashtags(self, options = {}):
        """
        Add <a></a> tags around the cashtags in the provided text.
        The <a> tags can be controlled with the following entries in the options hash.
        Also any elements in the options hash will be converted to HTML attributes
        and placed in the <a> tag.

        @cashtag_class              class to add to cashtag <a> tags
        @cashtag_url_base           the value for href attribute. The cashtag text (minus the $) will be appended at the end of this.
        @suppress_no_follow         do not add rel="nofollow" to auto-linked items
        @symbol_tag                 tag to apply around symbol (@, #, $) in username / hashtag / cashtag links
        @text_with_symbol_tag       tag to apply around text part in username / hashtag / cashtag links
        @link_attribute_transform   function to modify the attributes of a link based on the entity. called with (entity, attributes) params, and should modify the attributes hash.
        @link_text_transform        function to modify the text of a link based on the entity. called with (entity, text) params, and should return a modified text.
        """
        return self.auto_link_entities(self.extractor.extract_cashtags_with_indices(), options)

    def auto_link_urls(self, options = {}):
        """
        Add <a></a> tags around the URLs in the provided text.
        The <a> tags can be controlled with the following entries in the options hash.
        Also any elements in the options hash will be converted to HTML attributes
        and placed in the <a> tag.

        @url_class                  class to add to url <a> tags
        @invisible_tag_attrs        HTML attribute to add to invisible span tags
        @suppress_no_follow         do not add rel="nofollow" to auto-linked items
        @symbol_tag                 tag to apply around symbol (@, #, $) in username / hashtag / cashtag links
        @text_with_symbol_tag       tag to apply around text part in username / hashtag / cashtag links
        @url_target                 the value for target attribute on URL links.
        @link_attribute_transform   function to modify the attributes of a link based on the entity. called with (entity, attributes) params, and should modify the attributes hash.
        @link_text_transform        function to modify the text of a link based on the entity. called with (entity, text) params, and should return a modified text.
234 | """ 235 | return self.auto_link_entities(self.extractor.extract_urls_with_indices({'extract_url_without_protocol': False}), options) 236 | 237 | # begin private methods 238 | def _html_escape(self, text): 239 | for char in HTML_ENTITIES: 240 | text = text.replace(char, HTML_ENTITIES[char]) 241 | return text 242 | 243 | def _extract_html_attrs_from_options(self, options = {}): 244 | html_attrs = options.get('html_attrs', {}) 245 | options = options.copy() 246 | if 'html_attrs' in options: 247 | del(options['html_attrs']) 248 | for option in options.keys(): 249 | if not option in OPTIONS_NOT_ATTRIBUTES: 250 | html_attrs[option] = options[option] 251 | return html_attrs 252 | 253 | def _url_entities_hash(self, url_entities): 254 | entities = {} 255 | for entity in url_entities: 256 | entities[entity.get('url')] = entity 257 | return entities 258 | 259 | def _link_to_url(self, entity, chars, options = {}): 260 | url = entity.get('url') 261 | 262 | href = options.get('link_url_transform', lambda x: x)(url) 263 | 264 | # NOTE auto link to urls do not use any default values and options 265 | # like url_class but use suppress_no_follow. 266 | html_attrs = self._extract_html_attrs_from_options(options) 267 | if options.get('url_class'): 268 | html_attrs['class'] = options.get('url_class') 269 | 270 | # add target attribute only if @url_target is specified 271 | if options.get('url_target'): 272 | html_attrs['target'] = options.get('url_target') 273 | 274 | url_entities = self._url_entities_hash(options.get('url_entities', {})) 275 | 276 | # use entity from @url_entities if available 277 | url_entity = url_entities.get(url, entity) 278 | if url_entity.get('display_url'): 279 | html_attrs['title'] = url_entity.get('expanded_url') 280 | link_text = self._link_url_with_entity(url_entity, options) 281 | else: 282 | link_text = self._html_escape(url) 283 | 284 | link = self._link_to_text(entity, link_text, href, html_attrs, options) 285 | return chars[:entity['indices'][0]] + link + chars[entity['indices'][1]:] 286 | 287 | def _link_url_with_entity(self, entity, options = {}): 288 | """ 289 | Goal: If a user copies and pastes a tweet containing t.co'ed link, the resulting paste 290 | should contain the full original URL (expanded_url), not the display URL. 291 | 292 | Method: Whenever possible, we actually emit HTML that contains expanded_url, and use 293 | font-size:0 to hide those parts that should not be displayed (because they are not part of display_url). 294 | Elements with font-size:0 get copied even though they are not visible. 295 | Note that display:none doesn't work here. Elements with display:none don't get copied. 296 | 297 | Additionally, we want to *display* ellipses, but we don't want them copied. To make this happen we 298 | wrap the ellipses in a tco-ellipsis class and provide an onCopy handler that sets display:none on 299 | everything with the tco-ellipsis class. 300 | 301 | Exception: pic.twitter.com images, for which expandedUrl = "https://twitter.com/#!/username/status/1234/photo/1 302 | For those URLs, display_url is not a substring of expanded_url, so we don't do anything special to render the elided parts. 303 | For a pic.twitter.com URL, the only elided part will be the "https://", so this is fine. 
304 | """ 305 | display_url = entity.get('display_url').decode('utf-8') 306 | expanded_url = entity.get('expanded_url') 307 | invisible_tag_attrs = options.get('invisible_tag_attrs', DEFAULT_INVISIBLE_TAG_ATTRS) 308 | 309 | display_url_sans_ellipses = re.sub(ur'…', u'', display_url) 310 | 311 | if expanded_url.find(display_url_sans_ellipses) > -1: 312 | before_display_url, after_display_url = expanded_url.split(display_url_sans_ellipses, 2) 313 | preceding_ellipsis = re.search(ur'\A…', display_url) 314 | following_ellipsis = re.search(ur'…\z', display_url) 315 | if preceding_ellipsis is not None: 316 | preceding_ellipsis = preceding_ellipsis.group() 317 | else: 318 | preceding_ellipsis = '' 319 | if following_ellipsis is not None: 320 | following_ellipsis = following_ellipsis.group() 321 | else: 322 | following_ellipsis = '' 323 | 324 | # As an example: The user tweets "hi http://longdomainname.com/foo" 325 | # This gets shortened to "hi http://t.co/xyzabc", with display_url = "…nname.com/foo" 326 | # This will get rendered as: 327 | # 328 | # … 329 | # 337 | # http://longdomai 338 | # 339 | # 340 | # nname.com/foo 341 | # 342 | # 343 | #   344 | # … 345 | # 346 | 347 | return u"%s %s%s%s %s" % (preceding_ellipsis, invisible_tag_attrs, invisible_tag_attrs, self._html_escape(before_display_url), self._html_escape(display_url_sans_ellipses), invisible_tag_attrs, self._html_escape(after_display_url), invisible_tag_attrs, following_ellipsis) 348 | else: 349 | return self._html_escape(display_url) 350 | 351 | def _link_to_hashtag(self, entity, chars, options = {}): 352 | hashchar = chars[entity['indices'][0]] 353 | hashtag = entity['hashtag'] 354 | hashtag_class = options.get('hashtag_class') 355 | 356 | if REGEXEN['rtl_chars'].search(hashtag): 357 | hashtag_class += ' rtl' 358 | 359 | href = options.get('hashtag_url_transform', lambda ht: u'%s%s' % (options.get('hashtag_url_base'), ht))(hashtag) 360 | 361 | html_attrs = {} 362 | html_attrs.update(options.get('html_attrs', {})) 363 | html_attrs = { 364 | 'class': hashtag_class, 365 | 'title': u'#%s' % hashtag, 366 | } 367 | 368 | link = self._link_to_text_with_symbol(entity, hashchar, hashtag, href, html_attrs, options) 369 | return chars[:entity['indices'][0]] + link + chars[entity['indices'][1]:] 370 | 371 | def _link_to_cashtag(self, entity, chars, options = {}): 372 | dollar = chars[entity['indices'][0]] 373 | cashtag = entity['cashtag'] 374 | 375 | href = options.get('cashtag_url_transform', lambda ct: u'%s%s' % (options.get('cashtag_url_base'), ct))(cashtag) 376 | 377 | html_attrs = { 378 | 'class': options.get('cashtag_class'), 379 | 'title': u'$%s' % cashtag 380 | } 381 | html_attrs.update(options.get('html_attrs', {})) 382 | 383 | link = self._link_to_text_with_symbol(entity, dollar, cashtag, href, html_attrs, options) 384 | return chars[:entity['indices'][0]] + link + chars[entity['indices'][1]:] 385 | 386 | def _link_to_screen_name(self, entity, chars, options = {}): 387 | name = u'%s%s' % (entity['screen_name'], entity.get('list_slug') or '') 388 | chunk = options.get('link_text_transform', default_transform)(entity, name) 389 | name = name.lower() 390 | 391 | at = chars[entity['indices'][0]] 392 | 393 | html_attrs = options.get('html_attrs', {}).copy() 394 | if 'title' in html_attrs: 395 | del(html_attrs['title']) 396 | 397 | if entity.get('list_slug') and not options.get('supress_lists'): 398 | href = options.get('list_url_transform', lambda sn: u'%s%s' % (options.get('list_url_base'), sn))(name) 399 | html_attrs['class'] = 
        else:
            href = options.get('username_url_transform', lambda sn: u'%s%s' % (options.get('username_url_base'), sn))(name)
            html_attrs['class'] = options.get('username_class')

        link = self._link_to_text_with_symbol(entity, at, chunk, href, html_attrs, options)
        return chars[:entity['indices'][0]] + link + chars[entity['indices'][1]:]

    def _link_to_text_with_symbol(self, entity, symbol, text, href, attributes = {}, options = {}):
        tagged_symbol = u'<%s>%s</%s>' % (options.get('symbol_tag'), symbol, options.get('symbol_tag')) if options.get('symbol_tag') else symbol
        text = self._html_escape(text)
        tagged_text = u'<%s>%s</%s>' % (options.get('text_with_symbol_tag'), text, options.get('text_with_symbol_tag')) if options.get('text_with_symbol_tag') else text
        if options.get('username_include_symbol') or not REGEXEN['at_signs'].match(symbol):
            return u'%s' % self._link_to_text(entity, tagged_symbol + tagged_text, href, attributes, options)
        else:
            return u'%s%s' % (tagged_symbol, self._link_to_text(entity, tagged_text, href, attributes, options))

    def _link_to_text(self, entity, text, href, attributes = {}, options = {}):
        attributes['href'] = href
        if options.get('link_attribute_transform'):
            attributes = options.get('link_attribute_transform')(entity, attributes)
        text = options.get('link_text_transform', default_transform)(entity, text)
        return u'<a %s>%s</a>' % (self._tag_attrs(attributes), text)

    def _tag_attrs(self, attributes = {}):
        attrs = []
        for key in sorted(attributes.keys()):
            value = attributes[key]
            if key in BOOLEAN_ATTRIBUTES:
                attrs.append(key)
                continue
            if type(value) == list:
                value = u' '.join(value)
            attrs.append(u'%s="%s"' % (self._html_escape(key), self._html_escape(value)))

        return u' '.join(attrs)

--------------------------------------------------------------------------------
/twitter_text/extractor.py:
--------------------------------------------------------------------------------
# encoding=utf-8

from twitter_text.regex import REGEXEN
from twitter_text.unicode import force_unicode

class Extractor(object):
    """
    A module for including Tweet parsing in a class. This module provides functions for the extraction and processing
    of usernames, lists, URLs and hashtags.
    """

    def __init__(self, text):
        self.text = force_unicode(text)

    def _remove_overlapping_entities(self, entities):
        """
        Remove overlapping entities.
        This returns a new list with no overlapping entities.
        """

        # sort by start index
        entities.sort(key = lambda entity: entity['indices'][0])

        # remove duplicates
        prev = None
        for entity in [e for e in entities]:
            if prev and prev['indices'][1] > entity['indices'][0]:
                entities.remove(entity)
            else:
                prev = entity
        return entities

    def extract_entities_with_indices(self, options = {}, transform = lambda x: x):
        """
        Extracts all usernames, lists, hashtags and URLs in the Tweet text
        along with the indices for where the entity occurred.
        If the text is None or contains no entities, an empty list
        will be returned.

        If a transform is given then it will be called for each entity.
        """
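        # Illustrative note (example values, not from the original source):
        # for u'@dryan wrote about #python' this returns a mixed list of
        # mention and hashtag entity dicts, each carrying an 'indices' pair,
        # with any overlapping entities removed.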
41 | """ 42 | if not self.text: 43 | return [] 44 | 45 | # extract all entities 46 | entities = self.extract_urls_with_indices(options) + \ 47 | self.extract_hashtags_with_indices({'check_url_overlap': False}) + \ 48 | self.extract_mentions_or_lists_with_indices() + \ 49 | self.extract_cashtags_with_indices() 50 | 51 | entities = self._remove_overlapping_entities(entities) 52 | 53 | for entity in entities: 54 | entity = transform(entity) 55 | 56 | return entities 57 | 58 | def extract_mentioned_screen_names(self, transform = lambda x: x): 59 | """ 60 | Extracts a list of all usernames mentioned in the Tweet text. If the 61 | text is None or contains no username mentions an empty list 62 | will be returned. 63 | 64 | If a transform is given then it will be called for each username. 65 | """ 66 | return [transform(mention['screen_name']) for mention in self.extract_mentioned_screen_names_with_indices()] 67 | 68 | def extract_mentioned_screen_names_with_indices(self, transform = lambda x: x): 69 | """ 70 | Extracts a list of all usernames mentioned in the Tweet text 71 | along with the indices for where the mention ocurred. If the 72 | text is None or contains no username mentions, an empty list 73 | will be returned. 74 | 75 | If a transform is given, then it will be called with each username, the start 76 | index, and the end index in the text. 77 | """ 78 | if not self.text: 79 | return [] 80 | 81 | possible_screen_names = [] 82 | for match in self.extract_mentions_or_lists_with_indices(): 83 | if not match['list_slug']: 84 | possible_screen_names.append({ 85 | 'screen_name': transform(match['screen_name']), 86 | 'indices': match['indices'] 87 | }) 88 | return possible_screen_names 89 | 90 | def extract_mentions_or_lists_with_indices(self, transform = lambda x: x): 91 | """ 92 | Extracts a list of all usernames or lists mentioned in the Tweet text 93 | along with the indices for where the mention ocurred. If the 94 | text is None or contains no username or list mentions, an empty list 95 | will be returned. 96 | 97 | If a transform is given, then it will be called with each username, list slug, the start 98 | index, and the end index in the text. The list_slug will be an empty stirng 99 | if this is a username mention. 100 | """ 101 | if not REGEXEN['at_signs'].search(self.text): 102 | return [] 103 | 104 | possible_entries = [] 105 | for match in REGEXEN['valid_mention_or_list'].finditer(self.text): 106 | try: 107 | after = self.text[match.end()] 108 | except IndexError: 109 | # the mention was the last character in the string 110 | after = None 111 | if after and REGEXEN['end_mention_match'].match(after) or match.groups()[2].find('http') == 0: 112 | continue 113 | possible_entries.append({ 114 | 'screen_name': transform(match.groups()[2]), 115 | 'list_slug': match.groups()[3] or '', 116 | 'indices': [match.start() + len(match.groups()[0]), match.end()] 117 | }) 118 | 119 | return possible_entries 120 | 121 | def extract_reply_screen_name(self, transform = lambda x: x): 122 | """ 123 | Extracts the username username replied to in the Tweet text. If the 124 | text is None or is not a reply None will be returned. 

        If a transform is given then it will be called with the username replied to (if any).
        """
        if not self.text:
            return None

        possible_screen_name = REGEXEN['valid_reply'].match(self.text)
        if possible_screen_name is not None:
            if possible_screen_name.group(1).find('http') > -1:
                possible_screen_name = None
            else:
                possible_screen_name = transform(possible_screen_name.group(1))
        return possible_screen_name

    def extract_urls(self, transform = lambda x: x):
        """
        Extracts a list of all URLs included in the Tweet text. If the
        text is None or contains no URLs an empty list
        will be returned.

        If a transform is given then it will be called for each URL.
        """
        return [transform(url['url']) for url in self.extract_urls_with_indices()]

    def extract_urls_with_indices(self, options = {'extract_url_without_protocol': True}):
        """
        Extracts a list of all URLs included in the Tweet text along
        with the indices. If the text is None or contains no
        URLs an empty list will be returned.
        """
        urls = []
        for match in REGEXEN['valid_url'].finditer(self.text):
            complete, before, url, protocol, domain, port, path, query = match.groups()
            start_position = match.start() + len(before or '')
            end_position = match.end()
            # If protocol is missing and domain contains non-ASCII characters,
            # extract ASCII-only domains.
            if not protocol:
                if not options.get('extract_url_without_protocol') or REGEXEN['invalid_url_without_protocol_preceding_chars'].search(before):
                    continue
                last_url = None
                last_url_invalid_match = None
                for ascii_domain in REGEXEN['valid_ascii_domain'].finditer(domain):
                    ascii_domain = ascii_domain.group()
                    last_url = {
                        'url': ascii_domain,
                        'indices': [start_position - len(before or '') + complete.find(ascii_domain), start_position - len(before or '') + complete.find(ascii_domain) + len(ascii_domain)]
                    }
                    last_url_invalid_match = REGEXEN['invalid_short_domain'].search(ascii_domain) is not None
                    if not last_url_invalid_match:
                        urls.append(last_url)
                # no ASCII-only domain found. Skip the entire URL
                if not last_url:
                    continue
                if path:
                    last_url['url'] = url.replace(domain, last_url['url'])
                    last_url['indices'][1] = end_position
                    if last_url_invalid_match:
                        urls.append(last_url)
            else:
                if REGEXEN['valid_tco_url'].match(url):
                    url = REGEXEN['valid_tco_url'].match(url).group()
                    end_position = start_position + len(url)
                urls.append({
                    'url': url,
                    'indices': [start_position, end_position]
                })
        return urls

    def extract_hashtags(self, transform = lambda x: x):
        """
        Extracts a list of all hashtags included in the Tweet text. If the
        text is None or contains no hashtags an empty list
        will be returned. The list returned will not include the leading #
        character.

        If a transform is given then it will be called for each hashtag.
        """
        return [transform(hashtag['hashtag']) for hashtag in self.extract_hashtags_with_indices()]

    def extract_hashtags_with_indices(self, options = {'check_url_overlap': True}, transform = lambda x: x):
        """
        Extracts a list of all hashtags included in the Tweet text.
        If the text is None or contains no hashtags an empty list
        will be returned. The list returned will not include the leading #
        character.

        If a transform is given then it will be called for each hashtag.
        """
        tags = []
        for match in REGEXEN['valid_hashtag'].finditer(self.text):
            before, hashchar, hashtext = match.groups()
            start_position, end_position = match.span()
            start_position = start_position + len(before)
            if not (REGEXEN['end_hashtag_match'].match(self.text[end_position]) if len(self.text) > end_position else None) and not hashtext.find('http') == 0 and not REGEXEN['numeric_only'].match(hashtext):
                tags.append({
                    'hashtag': hashtext,
                    'indices': [start_position, end_position]
                })

        if options.get('check_url_overlap'):
            urls = self.extract_urls_with_indices()
            if len(urls):
                tags = tags + urls
                # remove duplicates
                tags = self._remove_overlapping_entities(tags)
                tags = [tag for tag in tags if 'hashtag' in tag]

        return tags

    def extract_cashtags(self, transform = lambda x: x):
        """
        Extracts a list of all cashtags included in the Tweet text. If the
        text is None or contains no cashtags an empty list
        will be returned. The list returned will not include the leading $
        character.

        If a transform is given then it will be called for each cashtag.
        """
        return [transform(cashtag['cashtag']) for cashtag in self.extract_cashtags_with_indices()]
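    # Illustrative sketch (example values, not from the original source):
    # Extractor(u'Watching $TWTR today').extract_cashtags() returns
    # [u'TWTR'], and extract_cashtags_with_indices() returns
    # [{'cashtag': u'TWTR', 'indices': [9, 14]}].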
256 | """ 257 | if not self.text or self.text.find('$') == -1: 258 | return [] 259 | 260 | tags = [] 261 | for match in REGEXEN['valid_cashtag'].finditer(self.text): 262 | before, dollar, cashtext = match.groups() 263 | start_position, end_position = match.span() 264 | start_position = start_position + len(before or '') 265 | tags.append({ 266 | 'cashtag': cashtext, 267 | 'indices': [start_position, end_position] 268 | }) 269 | 270 | return tags -------------------------------------------------------------------------------- /twitter_text/highlighter.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | 3 | import re 4 | from HTMLParser import HTMLParser 5 | 6 | from twitter_text.regex import UNICODE_SPACES 7 | from twitter_text.unicode import force_unicode 8 | 9 | DEFAULT_HIGHLIGHT_TAG = 'em' 10 | 11 | # from http://stackoverflow.com/questions/753052/strip-html-from-strings-in-python 12 | class MLStripper(HTMLParser): 13 | def __init__(self): 14 | self.reset() 15 | self.fed = [] 16 | def handle_data(self, d): 17 | self.fed.append(d) 18 | def get_data(self): 19 | return ''.join(self.fed) 20 | 21 | def strip_tags(html): 22 | s = MLStripper() 23 | s.feed(html) 24 | return s.get_data() 25 | 26 | class HitHighlighter(object): 27 | def __init__(self, text, **kwargs): 28 | self.text = force_unicode(text) 29 | self.parent = kwargs.get('parent', False) 30 | 31 | def hit_highlight(self, hits = [], **kwargs): 32 | if not hits and not kwargs.get('query'): 33 | return self.text 34 | 35 | if not hits and kwargs.get('query'): 36 | stripped_text = strip_tags(self.text) 37 | for match in re.finditer(ur'%s' % kwargs.get('query'), stripped_text): 38 | hits.append(match.span()) 39 | 40 | if hits and not type(hits) == list: 41 | raise Exception('The syntax for the hit_highlight method has changed. 
44 |
45 |         tag_name = kwargs.get('tag', DEFAULT_HIGHLIGHT_TAG)
46 |         tags = [u'<%s>' % tag_name, u'</%s>' % tag_name]
47 |
48 |         text = self.text
49 |         chunks = re.split(r'[<>]', text)
50 |         text_chunks = []
51 |         for index, chunk in enumerate(chunks):
52 |             if not index % 2:
53 |                 text_chunks.append(chunk)
54 |         for hit in sorted(hits, key = lambda chunk: chunk[1], reverse = True):
55 |             hit_start, hit_end = hit
56 |             placed = 0
57 |             for index, chunk in enumerate(chunks):
58 |                 if placed == 2:
59 |                     continue
60 |                 if index % 2:
61 |                     # we're inside a <tag>
62 |                     continue
63 |                 chunk_start = len(u''.join(text_chunks[0:index / 2]))
64 |                 chunk_end = chunk_start + len(chunk)
65 |                 if hit_start >= chunk_start and hit_start < chunk_end:
66 |                     chunk = chunk[:hit_start - chunk_start] + tags[0] + chunk[hit_start - chunk_start:]
67 |                     if hit_end <= chunk_end:
68 |                         hit_end += len(tags[0])
69 |                         chunk_end += len(tags[0])
70 |                     placed = 1
71 |                 if hit_end > chunk_start and hit_end <= chunk_end:
72 |                     chunk = chunk[:hit_end - chunk_start] + tags[1] + chunk[hit_end - chunk_start:]
73 |                     placed = 2
74 |                 chunks[index] = chunk
75 |             if placed == 1:
76 |                 chunks[-1] = chunks[-1] + tags[1]
77 |         result = []
78 |         for index, chunk in enumerate(chunks):
79 |             if index % 2:
80 |                 # we're inside a <tag>
81 |                 result.append(u'<%s>' % chunk)
82 |             else:
83 |                 result.append(chunk)
84 |         self.text = u''.join(result)
85 |         return self.text
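# Usage sketch (illustrative):
#
#     HitHighlighter(u'I love #sushi!').hit_highlight(query = u'sushi')
#     # => u'I love #<em>sushi</em>!'
#
# Explicit hits work too; each hit is a (start, end) offset pair into the
# tag-stripped text:
#
#     HitHighlighter(u'I love #sushi!').hit_highlight(hits = [(8, 13)])
#     # => u'I love #<em>sushi</em>!'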
--------------------------------------------------------------------------------
/twitter_text/regex.py:
--------------------------------------------------------------------------------
1 | # encoding=utf-8
2 |
3 | # A collection of regular expressions for parsing Tweet text. The regular expression
4 | # list is frozen at load time to ensure immutability. These regular expressions are
5 | # used throughout the Twitter classes. Special care has been taken to make
6 | # sure these regular expressions work with Tweets in all languages.
7 | import re, string
8 |
9 | REGEXEN = {} # :nodoc:
10 |
11 | def regex_range(start, end = None):
12 |     if end:
13 |         return u'%s-%s' % (unichr(start), unichr(end))
14 |     else:
15 |         return u'%s' % unichr(start)
16 |
17 | # Space is more than %20, U+3000 for example is the full-width space used with Kanji. Provide a short-hand
18 | # to access both the list of characters and a pattern suitable for use with String#split
19 | # Taken from: ActiveSupport::Multibyte::Handlers::UTF8Handler::UNICODE_WHITESPACE
20 | UNICODE_SPACES = []
21 | for space in reduce(lambda x,y: x + y if type(y) == list else x + [y], [
22 |     range(0x0009, 0x000E), # White_Space # Cc [5] <control-0009>..<control-000D> (range end is exclusive)
23 |     0x0020, # White_Space # Zs SPACE
24 |     0x0085, # White_Space # Cc <control-0085>
25 |     0x00A0, # White_Space # Zs NO-BREAK SPACE
26 |     0x1680, # White_Space # Zs OGHAM SPACE MARK
27 |     0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
28 |     range(0x2000, 0x200B), # White_Space # Zs [11] EN QUAD..HAIR SPACE (range end is exclusive)
29 |     0x2028, # White_Space # Zl LINE SEPARATOR
30 |     0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
31 |     0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
32 |     0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
33 |     0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
34 | ]):
35 |     UNICODE_SPACES.append(unichr(space))
36 | REGEXEN['spaces'] = re.compile(ur''.join(UNICODE_SPACES))
37 |
38 | # Characters not allowed in Tweets
39 | INVALID_CHARACTERS = [
40 |     0xFFFE, 0xFEFF, # BOM
41 |     0xFFFF, # Special
42 |     0x202A, 0x202B, 0x202C, 0x202D, 0x202E, # Directional change
43 | ]
44 | REGEXEN['invalid_control_characters'] = [unichr(x) for x in INVALID_CHARACTERS]
45 |
46 | REGEXEN['list_name'] = re.compile(ur'^[a-zA-Z][a-zA-Z0-9_\-\u0080-\u00ff]{0,24}$')
47 |
48 | # Latin accented characters
49 | # Excludes 0xd7 from the range (the multiplication sign, confusable with "x").
50 | # Also excludes 0xf7, the division sign
51 | LATIN_ACCENTS = [
52 |     regex_range(0x00c0, 0x00d6),
53 |     regex_range(0x00d8, 0x00f6),
54 |     regex_range(0x00f8, 0x00ff),
55 |     regex_range(0x0100, 0x024f),
56 |     regex_range(0x0253, 0x0254),
57 |     regex_range(0x0256, 0x0257),
58 |     regex_range(0x0259),
59 |     regex_range(0x025b),
60 |     regex_range(0x0263),
61 |     regex_range(0x0268),
62 |     regex_range(0x026f),
63 |     regex_range(0x0272),
64 |     regex_range(0x0289),
65 |     regex_range(0x028b),
66 |     regex_range(0x02bb),
67 |     regex_range(0x0300, 0x036f),
68 |     regex_range(0x1e00, 0x1eff),
69 | ]
70 | REGEXEN['latin_accents'] = re.compile(ur''.join(LATIN_ACCENTS), re.IGNORECASE | re.UNICODE)
71 | LATIN_ACCENTS = u''.join(LATIN_ACCENTS)
72 |
73 | RTL_CHARACTERS = ''.join([
74 |     regex_range(0x0600,0x06FF),
75 |     regex_range(0x0750,0x077F),
76 |     regex_range(0x0590,0x05FF),
77 |     regex_range(0xFE70,0xFEFF)
78 | ])
79 |
80 | NON_LATIN_HASHTAG_CHARS = ''.join([
81 |     # Cyrillic (Russian, Ukrainian, etc.)
82 |     regex_range(0x0400, 0x04ff), # Cyrillic
83 |     regex_range(0x0500, 0x0527), # Cyrillic Supplement
84 |     regex_range(0x2de0, 0x2dff), # Cyrillic Extended A
85 |     regex_range(0xa640, 0xa69f), # Cyrillic Extended B
86 |     regex_range(0x0591, 0x05bf), # Hebrew
87 |     regex_range(0x05c1, 0x05c2),
88 |     regex_range(0x05c4, 0x05c5),
89 |     regex_range(0x05c7),
90 |     regex_range(0x05d0, 0x05ea),
91 |     regex_range(0x05f0, 0x05f4),
92 |     regex_range(0xfb12, 0xfb28), # Hebrew Presentation Forms
93 |     regex_range(0xfb2a, 0xfb36),
94 |     regex_range(0xfb38, 0xfb3c),
95 |     regex_range(0xfb3e),
96 |     regex_range(0xfb40, 0xfb41),
97 |     regex_range(0xfb43, 0xfb44),
98 |     regex_range(0xfb46, 0xfb4f),
99 |     regex_range(0x0610, 0x061a), # Arabic
100 |     regex_range(0x0620, 0x065f),
101 |     regex_range(0x066e, 0x06d3),
102 |     regex_range(0x06d5, 0x06dc),
103 |     regex_range(0x06de, 0x06e8),
104 |     regex_range(0x06ea, 0x06ef),
105 |     regex_range(0x06fa, 0x06fc),
106 |     regex_range(0x06ff),
107 |     regex_range(0x0750, 0x077f), # Arabic Supplement
108 |     regex_range(0x08a0), # Arabic Extended A
109 |     regex_range(0x08a2, 0x08ac),
110 |     regex_range(0x08e4, 0x08fe),
111 |     regex_range(0xfb50, 0xfbb1), # Arabic Pres. Forms A
112 |     regex_range(0xfbd3, 0xfd3d),
113 |     regex_range(0xfd50, 0xfd8f),
114 |     regex_range(0xfd92, 0xfdc7),
115 |     regex_range(0xfdf0, 0xfdfb),
116 |     regex_range(0xfe70, 0xfe74), # Arabic Pres. Forms B
117 |     regex_range(0xfe76, 0xfefc),
118 |     regex_range(0x200c, 0x200c), # Zero-Width Non-Joiner
119 |     regex_range(0x0e01, 0x0e3a), # Thai
120 |     regex_range(0x0e40, 0x0e4e), # Thai
121 |     regex_range(0x1100, 0x11ff), # Hangul (Korean): Hangul Jamo
122 |     regex_range(0x3130, 0x3185), # Hangul Compatibility Jamo
123 |     regex_range(0xA960, 0xA97F), # Hangul Jamo Extended-A
124 |     regex_range(0xAC00, 0xD7AF), # Hangul Syllables
125 |     regex_range(0xD7B0, 0xD7FF), # Hangul Jamo Extended-B
126 |     regex_range(0xFFA1, 0xFFDC) # Half-width Hangul
127 | ])
128 |
129 | CJ_HASHTAG_CHARACTERS = ''.join([
130 |     regex_range(0x30A1, 0x30FA), regex_range(0x30FC, 0x30FE), # Katakana (full-width)
131 |     regex_range(0xFF66, 0xFF9F), # Katakana (half-width)
132 |     regex_range(0xFF10, 0xFF19), regex_range(0xFF21, 0xFF3A), regex_range(0xFF41, 0xFF5A), # Latin (full-width)
133 |     regex_range(0x3041, 0x3096), regex_range(0x3099, 0x309E), # Hiragana
134 |     regex_range(0x3400, 0x4DBF), # Kanji (CJK Extension A)
135 |     regex_range(0x4E00, 0x9FFF), # Kanji (Unified)
136 | ])
137 |
138 | try:
139 |     CJ_HASHTAG_CHARACTERS = ''.join([
140 |         CJ_HASHTAG_CHARACTERS,
141 |         regex_range(0x20000, 0x2A6DF), # Kanji (CJK Extension B)
142 |         regex_range(0x2A700, 0x2B73F), # Kanji (CJK Extension C)
143 |         regex_range(0x2B740, 0x2B81F), # Kanji (CJK Extension D)
144 |         regex_range(0x2F800, 0x2FA1F), regex_range(0x3003), regex_range(0x3005), regex_range(0x303B) # Kanji (CJK supplement)
145 |     ])
146 | except ValueError:
147 |     # this is a narrow python build so these extended Kanji characters won't work
148 |     pass
149 |
150 | PUNCTUATION_CHARS = ur'!"#$%&\'()*+,-./:;<=>?@\[\]^_\`{|}~'
151 | SPACE_CHARS = ur" \t\n\x0B\f\r"
152 | CTRL_CHARS = ur"\x00-\x1F\x7F"
153 |
154 | # A hashtag must contain latin characters, numbers and underscores, but not all numbers.
155 | HASHTAG_ALPHA = ur'[a-z_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS)
156 | HASHTAG_ALPHANUMERIC = ur'[a-z0-9_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS)
157 | HASHTAG_BOUNDARY = ur'\A|\z|\[|[^&a-z0-9_%s]' % (LATIN_ACCENTS + NON_LATIN_HASHTAG_CHARS + CJ_HASHTAG_CHARACTERS)
158 |
159 | HASHTAG = re.compile(ur'(%s)(#|＃)(%s*%s%s*)' % (HASHTAG_BOUNDARY, HASHTAG_ALPHANUMERIC, HASHTAG_ALPHA, HASHTAG_ALPHANUMERIC), re.IGNORECASE)
160 |
161 | REGEXEN['valid_hashtag'] = HASHTAG
162 | REGEXEN['end_hashtag_match'] = re.compile(ur'\A(?:[#＃]|:\/\/)', re.IGNORECASE | re.UNICODE)
163 | REGEXEN['numeric_only'] = re.compile(ur'^[\d]+$')
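# Illustrative example: REGEXEN['valid_hashtag'] captures three groups -- the
# boundary character (if any), the hash character itself and the hashtag text:
#
#     m = REGEXEN['valid_hashtag'].search(u'go #python3 now')
#     m.groups()  # => (u' ', u'#', u'python3')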
164 |
165 | REGEXEN['valid_mention_preceding_chars'] = re.compile(r'(?:[^a-zA-Z0-9_!#\$%&*@＠]|^|RT:?)')
166 | REGEXEN['at_signs'] = re.compile(ur'[@＠]')
167 | REGEXEN['valid_mention_or_list'] = re.compile(
168 |     ur'(%s)' % REGEXEN['valid_mention_preceding_chars'].pattern.decode('utf-8') + # preceding character
169 |     ur'(%s)' % REGEXEN['at_signs'].pattern + # at mark
170 |     ur'([a-zA-Z0-9_]{1,20})' + # screen name
171 |     ur'(\/[a-zA-Z][a-zA-Z0-9_\-]{0,24})?' # list (optional)
172 | )
173 | REGEXEN['valid_reply'] = re.compile(ur'^(?:[%s])*%s([a-zA-Z0-9_]{1,20})' % (REGEXEN['spaces'].pattern, REGEXEN['at_signs'].pattern), re.IGNORECASE | re.UNICODE)
174 | # Used in Extractor for final filtering
175 | REGEXEN['end_mention_match'] = re.compile(ur'\A(?:%s|[%s]|:\/\/)' % (REGEXEN['at_signs'].pattern, REGEXEN['latin_accents'].pattern), re.IGNORECASE | re.UNICODE)
176 |
177 | # URL related hash regex collection
178 | REGEXEN['valid_url_preceding_chars'] = re.compile(ur'(?:[^A-Z0-9@＠$#＃%s]|^)' % ur''.join(REGEXEN['invalid_control_characters']), re.IGNORECASE | re.UNICODE)
179 | REGEXEN['invalid_url_without_protocol_preceding_chars'] = re.compile(ur'[-_.\/]$')
180 | DOMAIN_VALID_CHARS = ur'[^%s%s%s%s%s]' % (PUNCTUATION_CHARS, SPACE_CHARS, CTRL_CHARS, ur''.join(REGEXEN['invalid_control_characters']), ur''.join(UNICODE_SPACES))
181 | REGEXEN['valid_subdomain'] = re.compile(ur'(?:(?:%s(?:[_-]|%s)*)?%s\.)' % (DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS), re.IGNORECASE | re.UNICODE)
182 | REGEXEN['valid_domain_name'] = re.compile(ur'(?:(?:%s(?:[-]|%s)*)?%s\.)' % (DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS, DOMAIN_VALID_CHARS), re.IGNORECASE | re.UNICODE)
183 | REGEXEN['valid_gTLD'] = re.compile(ur'(?:(?:academy|actor|aero|agency|arpa|asia|bar|bargains|berlin|best|bid|bike|biz|blue|boutique|build|builders|buzz|cab|camera|camp|cards|careers|cat|catering|center|ceo|cheap|christmas|cleaning|clothing|club|codes|coffee|com|community|company|computer|construction|contractors|cool|coop|cruises|dance|dating|democrat|diamonds|directory|domains|edu|education|email|enterprises|equipment|estate|events|expert|exposed|farm|fish|flights|florist|foundation|futbol|gallery|gift|glass|gov|graphics|guitars|guru|holdings|holiday|house|immobilien|industries|info|institute|int|international|jobs|kaufen|kim|kitchen|kiwi|koeln|kred|land|lighting|limo|link|luxury|management|mango|marketing|menu|mil|mobi|moda|monash|museum|nagoya|name|net|neustar|ninja|okinawa|onl|org|partners|parts|photo|photography|photos|pics|pink|plumbing|post|pro|productions|properties|pub|qpon|recipes|red|rentals|repair|report|reviews|rich|ruhr|sexy|shiksha|shoes|singles|social|solar|solutions|supplies|supply|support|systems|tattoo|technology|tel|tienda|tips|today|tokyo|tools|training|travel|uno|vacations|ventures|viajes|villas|vision|vote|voting|voto|voyage|wang|watch|wed|wien|wiki|works|xxx|xyz|zone|дети|онлайн|орг|сайт|بازار|شبكة|みんな|中信|中文网|公司|公益|在线|我爱你|政务|游戏|移动|网络|集团|삼성)(?=[^0-9a-z]|$))', re.IGNORECASE | re.UNICODE)
184 | REGEXEN['valid_ccTLD'] = re.compile(ur'(?:(?:ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bl|bm|bn|bo|bq|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cu|cv|cw|cx|cy|cz|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mf|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|za|zm|zw|мон|рф|срб|укр|қаз|الاردن|الجزائر|السعودية|المغرب|امارات|ایران|بھارت|تونس|سودان|سورية|عمان|فلسطين|قطر|مصر|مليسيا|پاکستان|भारत|বাংলা|ভারত|ਭਾਰਤ|ભારત|இந்தியா|இலங்கை|சிங்கப்பூர்|భారత్|ලංකා|ไทย|გე|中国|中國|台湾|台灣|新加坡|香港|한국)(?=[^0-9a-z]|$))', re.IGNORECASE | re.UNICODE)
185 | REGEXEN['valid_punycode'] = re.compile(ur'(?:xn--[0-9a-z]+)', re.IGNORECASE | re.UNICODE)
186 |
187 | REGEXEN['valid_domain'] = re.compile(ur'(?:%s*%s(?:%s|%s|%s))' % (REGEXEN['valid_subdomain'].pattern, REGEXEN['valid_domain_name'].pattern, REGEXEN['valid_gTLD'].pattern, REGEXEN['valid_ccTLD'].pattern, REGEXEN['valid_punycode'].pattern), re.IGNORECASE | re.UNICODE)
188 |
189 | # This is used in Extractor
190 | REGEXEN['valid_ascii_domain'] = re.compile(ur'(?:(?:[A-Za-z0-9\-_]|[%s])+\.)+(?:%s|%s|%s)' % (REGEXEN['latin_accents'].pattern, REGEXEN['valid_gTLD'].pattern, REGEXEN['valid_ccTLD'].pattern, REGEXEN['valid_punycode'].pattern), re.IGNORECASE | re.UNICODE)
191 |
192 | # This is used in Extractor for stricter t.co URL extraction
193 | REGEXEN['valid_tco_url'] = re.compile(ur'^https?:\/\/t\.co\/[a-z0-9]+', re.IGNORECASE | re.UNICODE)
194 |
195 | # This is used in Extractor to filter out unwanted URLs.
196 | REGEXEN['invalid_short_domain'] = re.compile(ur'\A%s%s\Z' % (REGEXEN['valid_domain_name'].pattern, REGEXEN['valid_ccTLD'].pattern), re.IGNORECASE | re.UNICODE)
197 |
198 | REGEXEN['valid_port_number'] = re.compile(ur'[0-9]+')
199 |
200 | REGEXEN['valid_general_url_path_chars'] = re.compile(ur"[a-z0-9!\*';:=\+\,\.\$\/%%#\[\]\-_~&|@%s]" % LATIN_ACCENTS, re.IGNORECASE | re.UNICODE)
201 | # Allow URL paths to contain balanced parens
202 | #   1. Used in Wikipedia URLs like /Primer_(film)
203 | #   2. Used in IIS sessions like /S(dfd346)/
204 | REGEXEN['valid_url_balanced_parens'] = re.compile(ur'\(%s+\)' % REGEXEN['valid_general_url_path_chars'].pattern, re.IGNORECASE | re.UNICODE)
205 | # Valid end-of-path characters (so /foo. does not gobble the period).
206 | #   1. Allow =&# for empty URL parameters and other URL-join artifacts
207 | REGEXEN['valid_url_path_ending_chars'] = re.compile(ur'[a-z0-9=_#\/\+\-%s]|(?:%s)' % (LATIN_ACCENTS, REGEXEN['valid_url_balanced_parens'].pattern), re.IGNORECASE | re.UNICODE)
208 | REGEXEN['valid_url_path'] = re.compile(ur'(?:(?:%s*(?:%s%s*)*%s)|(?:%s+\/))' % (REGEXEN['valid_general_url_path_chars'].pattern, REGEXEN['valid_url_balanced_parens'].pattern, REGEXEN['valid_general_url_path_chars'].pattern, REGEXEN['valid_url_path_ending_chars'].pattern, REGEXEN['valid_general_url_path_chars'].pattern), re.IGNORECASE | re.UNICODE)
209 |
210 | REGEXEN['valid_url_query_chars'] = re.compile(ur"[a-z0-9!?\*'\(\);:&=\+\$\/%#\[\]\-_\.,~|@]", re.IGNORECASE | re.UNICODE)
211 | REGEXEN['valid_url_query_ending_chars'] = re.compile(ur'[a-z0-9_&=#\/]', re.IGNORECASE | re.UNICODE)
212 | REGEXEN['valid_url'] = re.compile(ur'((%s)((https?:\/\/)?(%s)(?::(%s))?(/%s*)?(\?%s*%s)?))' % (
213 |     REGEXEN['valid_url_preceding_chars'].pattern,
214 |     REGEXEN['valid_domain'].pattern,
215 |     REGEXEN['valid_port_number'].pattern,
216 |     REGEXEN['valid_url_path'].pattern,
217 |     REGEXEN['valid_url_query_chars'].pattern,
218 |     REGEXEN['valid_url_query_ending_chars'].pattern
219 | ), re.IGNORECASE | re.UNICODE)
220 | # Matches
221 | # $1 total match
222 | # $2 Preceding character
223 | # $3 URL
224 | # $4 Protocol (optional)
225 | # $5 Domain(s)
226 | # $6 Port number (optional)
227 | # $7 URL Path and anchor
228 | # $8 Query String
229 |
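# Illustrative use of the groups above (group numbers are 1-based, so $3 is
# m.groups()[2]):
#
#     m = REGEXEN['valid_url'].search(u'see https://example.com/foo?bar=1 now')
#     m.groups()[2]  # => u'https://example.com/foo?bar=1'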
230 | REGEXEN['cashtag'] = re.compile(ur'[a-z]{1,6}(?:[._][a-z]{1,2})?', re.IGNORECASE)
231 | REGEXEN['valid_cashtag'] = re.compile(ur'(^|[%s])(\$|＄|﹩)(%s)(?=$|\s|[%s])' % (REGEXEN['spaces'].pattern, REGEXEN['cashtag'].pattern, PUNCTUATION_CHARS), re.IGNORECASE)
232 |
233 | # These URL validation pattern strings are based on the ABNF from RFC 3986
234 | REGEXEN['validate_url_unreserved'] = re.compile(ur'[a-z0-9\-._~]', re.IGNORECASE | re.UNICODE)
235 | REGEXEN['validate_url_pct_encoded'] = re.compile(ur'(?:%[0-9a-f]{2})', re.IGNORECASE | re.UNICODE)
236 | REGEXEN['validate_url_sub_delims'] = re.compile(ur"[!$&'()*+,;=]", re.IGNORECASE | re.UNICODE)
237 | REGEXEN['validate_url_pchar'] = re.compile(ur'(?:%s|%s|%s|[:\|@])' % (REGEXEN['validate_url_unreserved'].pattern, REGEXEN['validate_url_pct_encoded'].pattern, REGEXEN['validate_url_sub_delims'].pattern), re.IGNORECASE | re.UNICODE)
238 |
239 | REGEXEN['validate_url_scheme'] = re.compile(ur'(?:[a-z][a-z0-9+\-.]*)', re.IGNORECASE | re.UNICODE)
240 | REGEXEN['validate_url_userinfo'] = re.compile(ur'(?:%s|%s|%s|:)*' % (REGEXEN['validate_url_unreserved'].pattern, REGEXEN['validate_url_pct_encoded'].pattern, REGEXEN['validate_url_sub_delims'].pattern), re.IGNORECASE | re.UNICODE)
241 |
242 | REGEXEN['validate_url_dec_octet'] = re.compile(ur'(?:[0-9]|(?:[1-9][0-9])|(?:1[0-9]{2})|(?:2[0-4][0-9])|(?:25[0-5]))', re.IGNORECASE | re.UNICODE)
243 | REGEXEN['validate_url_ipv4'] = re.compile(ur'(?:%s(?:\.%s){3})' % (REGEXEN['validate_url_dec_octet'].pattern, REGEXEN['validate_url_dec_octet'].pattern), re.IGNORECASE | re.UNICODE)
244 |
245 | # Punting on real IPv6 validation for now
246 | REGEXEN['validate_url_ipv6'] = re.compile(ur'(?:\[[a-f0-9:\.]+\])', re.IGNORECASE | re.UNICODE)
247 |
248 | # Also punting on IPvFuture for now
249 | REGEXEN['validate_url_ip'] = re.compile(ur'(?:%s|%s)' % (REGEXEN['validate_url_ipv4'].pattern, REGEXEN['validate_url_ipv6'].pattern), re.IGNORECASE | re.UNICODE)
250 |
251 | # This is more strict than the rfc specifies
252 | REGEXEN['validate_url_subdomain_segment'] = re.compile(ur'(?:[a-z0-9](?:[a-z0-9_\-]*[a-z0-9])?)', re.IGNORECASE | re.UNICODE)
253 | REGEXEN['validate_url_domain_segment'] = re.compile(ur'(?:[a-z0-9](?:[a-z0-9\-]*[a-z0-9])?)', re.IGNORECASE | re.UNICODE)
254 | REGEXEN['validate_url_domain_tld'] = re.compile(ur'(?:[a-z](?:[a-z0-9\-]*[a-z0-9])?)', re.IGNORECASE | re.UNICODE)
255 | REGEXEN['validate_url_domain'] = re.compile(ur'(?:(?:%s\.)*(?:%s\.)%s)' % (REGEXEN['validate_url_subdomain_segment'].pattern, REGEXEN['validate_url_domain_segment'].pattern, REGEXEN['validate_url_domain_tld'].pattern), re.IGNORECASE | re.UNICODE)
256 |
257 | REGEXEN['validate_url_host'] = re.compile(ur'(?:%s|%s)' % (REGEXEN['validate_url_ip'].pattern, REGEXEN['validate_url_domain'].pattern), re.IGNORECASE | re.UNICODE)
258 |
259 | # Unencoded internationalized domains - this doesn't check for invalid UTF-8 sequences
260 | REGEXEN['validate_url_unicode_subdomain_segment'] = re.compile(ur'(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9_\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)', re.IGNORECASE | re.UNICODE)
261 | REGEXEN['validate_url_unicode_domain_segment'] = re.compile(ur'(?:(?:[a-z0-9]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)', re.IGNORECASE | re.UNICODE)
262 | REGEXEN['validate_url_unicode_domain_tld'] = re.compile(ur'(?:(?:[a-z]|[^\x00-\x7f])(?:(?:[a-z0-9\-]|[^\x00-\x7f])*(?:[a-z0-9]|[^\x00-\x7f]))?)', re.IGNORECASE | re.UNICODE)
263 | REGEXEN['validate_url_unicode_domain'] = re.compile(ur'(?:(?:%s\.)*(?:%s\.)%s)' % (REGEXEN['validate_url_unicode_subdomain_segment'].pattern, REGEXEN['validate_url_unicode_domain_segment'].pattern, REGEXEN['validate_url_unicode_domain_tld'].pattern), re.IGNORECASE | re.UNICODE)
264 |
265 | REGEXEN['validate_url_unicode_host'] = re.compile(ur'(?:%s|%s)' % (REGEXEN['validate_url_ip'].pattern, REGEXEN['validate_url_unicode_domain'].pattern), re.IGNORECASE | re.UNICODE)
266 |
267 | REGEXEN['validate_url_port'] = re.compile(ur'[0-9]{1,5}')
268 |
269 | REGEXEN['validate_url_unicode_authority'] = re.compile(ur'(?:(%s)@)?(%s)(?::(%s))?' % (REGEXEN['validate_url_userinfo'].pattern, REGEXEN['validate_url_unicode_host'].pattern, REGEXEN['validate_url_port'].pattern), re.IGNORECASE | re.UNICODE)
270 |
271 | REGEXEN['validate_url_authority'] = re.compile(ur'(?:(%s)@)?(%s)(?::(%s))?' % (REGEXEN['validate_url_userinfo'].pattern, REGEXEN['validate_url_host'].pattern, REGEXEN['validate_url_port'].pattern), re.IGNORECASE | re.UNICODE)
272 |
273 | REGEXEN['validate_url_path'] = re.compile(ur'(/%s*)*' % REGEXEN['validate_url_pchar'].pattern, re.IGNORECASE | re.UNICODE)
274 | REGEXEN['validate_url_query'] = re.compile(ur'(%s|/|\?)*' % REGEXEN['validate_url_pchar'].pattern, re.IGNORECASE | re.UNICODE)
275 | REGEXEN['validate_url_fragment'] = re.compile(ur'(%s|/|\?)*' % REGEXEN['validate_url_pchar'].pattern, re.IGNORECASE | re.UNICODE)
276 |
277 | # Modified version of RFC 3986 Appendix B
278 | REGEXEN['validate_url_unencoded'] = re.compile(ur'\A(?:([^:/?#]+)://)?([^/?#]*)([^?#]*)(?:\?([^#]*))?(?:\#(.*))?\Z', re.IGNORECASE | re.UNICODE)
279 |
280 | REGEXEN['rtl_chars'] = re.compile(ur'[%s]' % RTL_CHARACTERS, re.IGNORECASE | re.UNICODE)
281 |
--------------------------------------------------------------------------------
/twitter_text/templatetags/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dryan/twitter-text-py/143ee74751597efd782417df4773d586720428a4/twitter_text/templatetags/__init__.py
--------------------------------------------------------------------------------
/twitter_text/templatetags/twitterize.py:
--------------------------------------------------------------------------------
1 | try:
2 |     from django.template import Library
3 |     from django.template.defaultfilters import stringfilter
4 | except ImportError:
5 |     raise Exception('Django is not installed.')
6 |
7 | from twitter_text import TwitterText
8 |
9 | register = Library()
10 |
11 | @register.filter(name = 'twitter_text')
12 | @stringfilter
13 | def twitter_text(text, search_query = False):
14 |     """
15 |     Parses a text string through the TwitterText auto_link method and, if search_query is passed, through the hit_highlight method.
16 |     """
17 |     tt = TwitterText(text)
18 |     if search_query:
19 |         tt.text = tt.highlighter.hit_highlight(query = search_query)
20 |     tt.text = tt.autolink.auto_link()
21 |     return tt.text
22 | twitter_text.is_safe = True
--------------------------------------------------------------------------------
/twitter_text/unicode.py:
--------------------------------------------------------------------------------
1 | import types, datetime
2 | from decimal import Decimal
3 |
4 | # borrowed from django.utils.encoding
5 | class TwitterTextUnicodeDecodeError(UnicodeDecodeError):
6 |     def __init__(self, obj, *args):
7 |         self.obj = obj
8 |         UnicodeDecodeError.__init__(self, *args)
9 |
10 |     def __str__(self):
11 |         original = UnicodeDecodeError.__str__(self)
12 |         return '%s. You passed in %r (%s)' % (original, self.obj,
13 |             type(self.obj))
14 |
15 | def is_protected_type(obj):
16 |     """Determine if the object instance is of a protected type.
17 |
18 |     Objects of protected types are preserved as-is when passed to
19 |     force_unicode(strings_only=True).
20 |     """
21 |     return isinstance(obj, (
22 |         types.NoneType,
23 |         int, long,
24 |         datetime.datetime, datetime.date, datetime.time,
25 |         float, Decimal)
26 |     )
27 |
28 | def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
29 |     """
30 |     Similar to smart_unicode, except that lazy instances are resolved to
31 |     strings, rather than kept as lazy objects.
32 |
33 |     If strings_only is True, don't convert (some) non-string-like objects.
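
    Example (illustrative):

        force_unicode('caf\xc3\xa9')            # => u'caf\xe9'
        force_unicode(None, strings_only=True)  # => None (protected type, returned unchanged)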
34 |     """
35 |     if strings_only and is_protected_type(s):
36 |         return s
37 |     try:
38 |         if not isinstance(s, basestring):
39 |             if hasattr(s, '__unicode__'):
40 |                 s = unicode(s)
41 |             else:
42 |                 try:
43 |                     s = unicode(str(s), encoding, errors)
44 |                 except UnicodeEncodeError:
45 |                     if not isinstance(s, Exception):
46 |                         raise
47 |                     # If we get to here, the caller has passed in an Exception
48 |                     # subclass populated with non-ASCII data without special
49 |                     # handling to display as a string. We need to handle this
50 |                     # without raising a further exception. We do an
51 |                     # approximation to what the Exception's standard str()
52 |                     # output should be.
53 |                     s = ' '.join([force_unicode(arg, encoding, strings_only,
54 |                         errors) for arg in s])
55 |         elif not isinstance(s, unicode):
56 |             # Note: We use .decode() here, instead of unicode(s, encoding,
57 |             # errors), so that if s is a SafeString, it ends up being a
58 |             # SafeUnicode at the end.
59 |             s = s.decode(encoding, errors)
60 |     except UnicodeDecodeError, e:
61 |         if not isinstance(s, Exception):
62 |             raise TwitterTextUnicodeDecodeError(s, *e.args)
63 |         else:
64 |             # If we get to here, the caller has passed in an Exception
65 |             # subclass populated with non-ASCII bytestring data without a
66 |             # working unicode method. Try to handle this without raising a
67 |             # further exception by individually forcing the exception args
68 |             # to unicode.
69 |             s = ' '.join([force_unicode(arg, encoding, strings_only,
70 |                 errors) for arg in s])
71 |     return s
72 |
--------------------------------------------------------------------------------
/twitter_text/validation.py:
--------------------------------------------------------------------------------
1 | # encoding=utf-8
2 |
3 | import re
4 |
5 | from twitter_text.unicode import force_unicode
6 | from twitter_text.extractor import Extractor
7 | from twitter_text.regex import REGEXEN
8 |
9 | MAX_LENGTH = 140
10 |
11 | DEFAULT_TCO_URL_LENGTHS = {
12 |     'short_url_length': 22,
13 |     'short_url_length_https': 23,
14 |     'characters_reserved_per_media': 22,
15 | }
16 |
17 | class Validation(object):
18 |     def __init__(self, text, **kwargs):
19 |         self.text = force_unicode(text)
20 |         self.parent = kwargs.get('parent', False)
21 |
22 |     def tweet_length(self, options = {}):
23 |         """
24 |         Returns the length of the string as it would be displayed. This is equivalent to the length of the Unicode NFC
25 |         (see: http://www.unicode.org/reports/tr15). This is needed in order to consistently calculate the length of a
26 |         string no matter which actual form was transmitted. For example:
27 |
28 |             U+0065 Latin Small Letter E
29 |         +   U+0301 Combining Acute Accent
30 |         ----------
31 |         =   2 bytes, 2 characters, displayed as é (1 visual glyph)
32 |             The NFC of {U+0065, U+0301} is {U+00E9}, which is a single character with a display length of 1.
33 |
34 |         The string could also contain U+00E9 already, in which case the canonicalization will not change the value.
35 |         """
36 |
37 |         assert (not self.parent or not getattr(self.parent, 'has_been_linked', False)), 'The validator should only be run on text before it has been modified.'
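        # Illustrative arithmetic: in u'go http://example.com now' (25 characters)
        # the 18-character URL is counted as a t.co replacement instead, so the
        # reported length is 25 - 18 + short_url_length (22) = 29.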
38 |         options = dict(options) # work on a copy so neither the caller's dict nor the shared default is mutated
39 |         for key in DEFAULT_TCO_URL_LENGTHS:
40 |             if not key in options:
41 |                 options[key] = DEFAULT_TCO_URL_LENGTHS[key]
42 |
43 |         length = len(self.text)
44 |         # thanks force_unicode for making this so much simpler than the ruby version
45 |
46 |         for url in Extractor(self.text).extract_urls_with_indices():
47 |             # remove the length of the original URL
48 |             length += url['indices'][0] - url['indices'][1]
49 |             # add the length of the t.co URL that will replace it
50 |             length += options.get('short_url_length_https') if url['url'].lower().find('https://') > -1 else options.get('short_url_length')
51 |
52 |         if self.parent and hasattr(self.parent, 'tweet_length'):
53 |             self.parent.tweet_length = length
54 |         return length
55 |
56 |     def tweet_invalid(self):
57 |         """
58 |         Check the text for any reason that it may not be valid as a Tweet. This is meant as a pre-validation
59 |         before posting to api.twitter.com. There are several server-side reasons for Tweets to fail but this pre-validation
60 |         will allow quicker feedback.
61 |
62 |         Returns False if this text is valid. Otherwise one of the following error strings will be returned:
63 |
64 |         "Too long":: if the text is too long
65 |         "Empty text":: if the text is empty
66 |         "Invalid characters":: if the text contains non-Unicode or any of the disallowed Unicode characters
67 |         """
68 |
69 |         valid = True # optimism
70 |         validation_error = None
71 |
72 |         if not self.tweet_length():
73 |             valid, validation_error = False, 'Empty text'
74 |
75 |         if self.tweet_length() > MAX_LENGTH:
76 |             valid, validation_error = False, 'Too long'
77 |
78 |         if re.search(ur'[%s]' % ur''.join(REGEXEN['invalid_control_characters']), self.text):
79 |             valid, validation_error = False, 'Invalid characters'
80 |
81 |         if self.parent and hasattr(self.parent, 'tweet_is_valid'):
82 |             self.parent.tweet_is_valid = valid
83 |         if self.parent and hasattr(self.parent, 'tweet_validation_error'):
84 |             self.parent.tweet_validation_error = validation_error
85 |
86 |         return validation_error if not valid else False
87 |
88 |     def valid_tweet_text(self):
89 |         return not self.tweet_invalid()
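    # Illustrative behavior:
    #
    #     Validation(u'').tweet_invalid()          # => 'Empty text'
    #     Validation(u'Hello!').tweet_invalid()    # => False
    #     Validation(u'Hello!').valid_tweet_text() # => True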
90 |
91 |     def valid_username(self):
92 |         if not self.text:
93 |             return False
94 |
95 |         extracted = Extractor(self.text).extract_mentioned_screen_names()
96 |
97 |         return len(extracted) == 1 and extracted[0] == self.text[1:]
98 |
99 |     def valid_list(self):
100 |         match = re.compile(ur'^%s$' % REGEXEN['valid_mention_or_list'].pattern).search(self.text)
101 |         return bool(match is not None and match.groups()[0] == "" and match.groups()[3])
102 |
103 |     def valid_hashtag(self):
104 |         if not self.text:
105 |             return False
106 |
107 |         extracted = Extractor(self.text).extract_hashtags()
108 |
109 |         return len(extracted) == 1 and extracted[0] == self.text[1:]
110 |
111 |     def valid_url(self, unicode_domains = True, require_protocol = True):
112 |         if not self.text:
113 |             return False
114 |
115 |         url_parts = REGEXEN['validate_url_unencoded'].match(self.text)
116 |
117 |         if not (url_parts and url_parts.string == self.text):
118 |             return False
119 |
120 |         scheme, authority, path, query, fragment = url_parts.groups()
121 |
122 |         if not (
123 |             (
124 |                 not require_protocol
125 |                 or (
126 |                     self._valid_match(scheme, REGEXEN['validate_url_scheme'])
127 |                     and re.compile(ur'^https?$', re.IGNORECASE).match(scheme)
128 |                 )
129 |             )
130 |             and (
131 |                 path == ''
132 |                 or self._valid_match(path, REGEXEN['validate_url_path'])
133 |             )
134 |             and self._valid_match(query, REGEXEN['validate_url_query'], True)
135 |             and self._valid_match(fragment, REGEXEN['validate_url_fragment'], True)
136 |         ):
137 |             return False
138 |
139 |         return bool(
140 |             (
141 |                 unicode_domains
142 |                 and self._valid_match(authority, REGEXEN['validate_url_unicode_authority'])
143 |                 and REGEXEN['validate_url_unicode_authority'].match(authority).string == authority
144 |             )
145 |             or (
146 |                 not unicode_domains
147 |                 and self._valid_match(authority, REGEXEN['validate_url_authority'])
148 |                 and REGEXEN['validate_url_authority'].match(authority).string == authority
149 |             )
150 |         )
151 |
152 |     def _valid_match(self, string, re_obj, optional = False):
153 |         if string is None:
154 |             return optional # a missing component is only acceptable when it is optional; matching None would raise a TypeError
155 |         match = re_obj.match(string)
156 |         if optional:
157 |             return not (string and (match is None or not match.string[match.span()[0]:match.span()[1]] == string))
158 |         else:
159 |             return bool(string and match and match.string[match.span()[0]:match.span()[1]] == string)
--------------------------------------------------------------------------------