├── .gitignore ├── .nojekyll ├── LICENSE ├── README.rst ├── _config.yml ├── doc_build.sh ├── docs ├── Makefile ├── readme.md └── source │ ├── README.rst │ ├── conf.py │ ├── index.rst │ ├── tweet_parser.getter_methods.rst │ └── tweet_parser.rst ├── setup.py ├── test ├── test_tweet_parser.py └── tweet_payload_examples │ ├── activity_streams_examples.json │ ├── broken_and_unsupported_payloads │ ├── activity_streams_additional_field.json │ ├── activity_streams_missing_field.json │ ├── original_format_additional_field.json │ ├── original_format_missing_field.json │ ├── original_format_missing_quotetweet_field.json │ ├── original_format_missing_user.json │ └── public_api_sample.json │ └── original_format_examples.json ├── tools ├── demo_notebook.ipynb └── parse_tweets.py └── tweet_parser ├── __init__.py ├── deprecator.py ├── getter_methods ├── __init__.py ├── gnip_fields.py ├── tweet_counts.py ├── tweet_date.py ├── tweet_embeds.py ├── tweet_entities.py ├── tweet_generator.py ├── tweet_geo.py ├── tweet_links.py ├── tweet_reply.py ├── tweet_text.py └── tweet_user.py ├── lazy_property.py ├── tweet.py ├── tweet_checking.py ├── tweet_keys.py └── tweet_parser_errors.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # macosx 104 | .DS_Store 105 | 106 | # vim 107 | *.swp 108 | *.swo 109 | *~ 110 | -------------------------------------------------------------------------------- /.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xdevplatform/tweet_parser/3435de8367d36b483a6cfd8d46cc28694ee8a42e/.nojekyll -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Twitter, Inc 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or 
sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Tweet Parser 2 | ============ 3 | 4 | Authors: `Fiona Pigott `__, `Jeff 5 | Kolb `__, `Josh 6 | Montague `__, `Aaron 7 | Gonzales `__ 8 | 9 | Goal: 10 | ----- 11 | 12 | Allow reliable parsing of Tweets delivered by the Gnip platform, in both 13 | activity-streams and original formats. 14 | 15 | Status: 16 | ------- 17 | 18 | This package can be installed by cloning the repo and using 19 | ``pip install -e .``, or by using ``pip install tweet_parser``. 20 | 21 | As of version 1.0.5, the package works with Python 2 and 3, and the 22 | API should be relatively stable. Recommended to use the more recent release. 23 | Current release is 1.13.2 24 | 25 | Currently, this parser does not explicitly support Public API Twitter 26 | data. 27 | 28 | Usage: 29 | ------ 30 | 31 | This package is intended to be used as a Python module inside your other 32 | Tweet-related code. An example Python program (after pip installing the 33 | package) would be: 34 | 35 | .. 
code:: python 36 | 37 | from tweet_parser.tweet import Tweet 38 | from tweet_parser.tweet_parser_errors import NotATweetError 39 | import fileinput 40 | import json 41 | 42 | for line in fileinput.FileInput("gnip_tweet_data.json"): 43 | try: 44 | tweet_dict = json.loads(line) 45 | tweet = Tweet(tweet_dict) 46 | except (json.JSONDecodeError,NotATweetError): 47 | pass 48 | print(tweet.created_at_string, tweet.all_text) 49 | 50 | I've also added a simple command-line utility: 51 | 52 | .. code:: bash 53 | 54 | python tools/parse_tweets.py -f"gnip_tweet_data.json" -c"created_at_string,all_text" 55 | 56 | Testing: 57 | -------- 58 | 59 | A Python test module, ``test_tweet_parser.py``, exists in ``test/``. 60 | 61 | The most important thing that it tests is the equivalence of outputs 62 | when comparing both activity-streams input and original-format input. 63 | Any new getter will be tested by running 64 | ``test$ python test_tweet_parser.py``, as the test checks every method 65 | attached to the Tweet object, for every test tweet stored in 66 | ``test/tweet_payload_examples``. For any cases where it is expected that 67 | the outputs are different (e.g., outputs that depend on poll options), 68 | conditional statements should be added to this test. 69 | 70 | An option also exists for run-time checking of Tweet payload formats. 71 | This compares the set of all Tweet field keys to a superset of all 72 | possible keys, as well as a minimum set of all required keys, to make 73 | sure that each newly loaded Tweet fits those parameters. This shouldn't 74 | be run every time you load Tweets (for one, it's slow), but is 75 | implemented to use as a periodic check against Tweet format changes. 76 | This option is enabled with ``--do_format_validation`` on the command 77 | line, and by setting the keyword argument ``do_format_validation`` to 78 | ``True`` when initializing a ``Tweet`` object. 
79 | 80 | Contributing 81 | ------------ 82 | 83 | Submit bug reports or feature requests through GitHub Issues, with 84 | self-contained minimum working examples where appropriate. 85 | 86 | To contribute code, fork this repo, create your own local feature 87 | branch, make your changes, test them, and submit a pull request to the 88 | master branch. The contribution guidelines specified in the ``pandas`` 89 | `documentation `__ 90 | are a great reference. 91 | 92 | When you submit a change, change the version number. For bug fixes and 93 | non-breaking changes that do not affect the top-level Tweet object API 94 | (fixing a bug or changing the internals of a getter while package naming/structure 95 | remains the same), increment the last number (X.Y.Z -> X.Y.Z+1) in 96 | ``setup.py``. For changes that do affect the top-level Tweet object API (e.g., adding a 97 | new getter), increment the middle number (X.Y.Z -> X.Y+1.0). 98 | 99 | Guidelines for new getters 100 | ~~~~~~~~~~~~~~~~~~~~~~~~~~ 101 | 102 | A *getter* is a method in the Tweet class and the accompanying code in 103 | the ``getter_methods`` module. A getter for some property should: 104 | 105 | - be named ``<property_name>``, a method in ``Tweet`` decorated with 106 | ``@lazy_property`` 107 | - have a corresponding method named 108 | ``get_<property_name>(tweet)`` in the ``getter_methods`` module that 109 | implements the logic, nested under the appropriate submodule (a text 110 | property probably lives under the ``getter_methods.tweet_text`` 111 | submodule) 112 | - provide the exact same output for original format and 113 | activity-streams format Tweet input, except in the case where certain 114 | information is unavailable (see ``get_poll_options``). 115 | 116 | In general, prefer that the ``get_<property_name>`` work on a simple Tweet 117 | dictionary as well as a Tweet object (this makes unit testing easier). 
118 | This means that you might use ``is_original_format(tweet)`` rather than 119 | ``tweet.is_original_format`` to check format inside of a getter. 120 | 121 | Adding unit tests for your getter in the docstrings in the "Example" 122 | section is helpful. See existing getters for examples. 123 | 124 | In general, make detailed docstrings with examples in 125 | ``get_<property_name>``, and more concise docstrings in ``Tweet``, with a 126 | reference for where to find the ``get_<property_name>`` getter that 127 | implements the logic. 128 | 129 | Style 130 | ~~~~~ 131 | 132 | Adhere to the PEP8 style. Using a Python linter (like flake8) is 133 | recommended. 134 | 135 | For documentation style, use `Google-style 136 | docstrings `__. 137 | Refer to the `Python doctest 138 | documentation `__ for 139 | doctest guidelines. 140 | 141 | Testing 142 | ~~~~~~~ 143 | 144 | Create an isolated virtual environment for testing (there are currently 145 | no external dependencies for this library). 146 | 147 | Test your new feature by reinstalling the library in your virtual 148 | environment and running the test script as shown below. Fix any issues 149 | until all tests pass. 150 | 151 | .. code-block:: bash 152 | 153 | (env) [tweet_parser]$ pip install -e . 154 | (env) [tweet_parser]$ cd test/; python test_tweet_parser.py; cd - 155 | 156 | Furthermore, if contributing a new accessor or getter method for payload 157 | elements, verify the code works as you intended by running the 158 | ``parse_tweets.py`` script with your new field, as shown below. Check 159 | that both input types produce the intended output. 160 | 161 | Note that FieldDeprecationWarnings will appear while testing for certain getters; this is expected behavior. 162 | 163 | .. code-block:: bash 164 | 165 | (env) [tweet_parser]$ pip install -e . 
166 | (env) [tweet_parser]$ python tools/parse_tweets.py -f test/tweet_payload_examples/activity_streams_examples.json -c 167 | 168 | And lastly, if you've added new docstrings and doctests, from the 169 | ``docs`` directory, run ``make html`` (to check docstring formatting) 170 | and ``make doctest`` to run the doctests. 171 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman -------------------------------------------------------------------------------- /doc_build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -ne 1 ]; then 4 | echo "Error: Please provide a branch name from which documentation will be built"; 5 | exit 1 6 | fi 7 | 8 | BRANCH_NAME=$1 9 | 10 | echo "Building documentation from $BRANCH_NAME" 11 | echo "checking out gh-pages" 12 | if ! git checkout gh-pages 13 | then 14 | echo >&2 "checkout of gh-pages branch failed; please ensure you have local changes commited prior to running this script " 15 | echo "exiting" 16 | exit 1 17 | fi 18 | 19 | pwd 20 | echo "removing current files" 21 | rm -rf *.egg-info 22 | 23 | git pull origin gh-pages 24 | # clean old doc build 25 | rm -r *.html *.js _sources/ _static/ 26 | 27 | # ensure .nojekyll is here 28 | touch .nojekyll 29 | 30 | # grab the correct stuff from the build branch 31 | git checkout $BRANCH_NAME docs tweet_parser README.rst 32 | 33 | mv docs/* . 
34 | cp README.rst source/README.rst 35 | make html 36 | mv -fv build/html/* ./ 37 | rm -r tweet_parser docs build Makefile source __pycache__/ 38 | echo "--------------------------------------------------------------------" 39 | echo " docs built; please review these changes and then run the following:" 40 | echo "--------------------------------------------------------------------" 41 | echo git add -A 42 | echo git commit -m \"Generated gh-pages for `git log master -1 --pretty=short --abbrev-commit | grep commit`\" 43 | echo git push origin gh-pages 44 | echo git checkout $BRANCH_NAME 45 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = TweetParser 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | GH_PAGES_SOURCES = docs tweet_parser 11 | 12 | # Put it first so that "make" without argument is like "make help". 13 | help: 14 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 15 | 16 | .PHONY: help Makefile 17 | 18 | # Catch-all target: route all unknown targets to Sphinx using the new 19 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 20 | %: Makefile 21 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 22 | 23 | -------------------------------------------------------------------------------- /docs/readme.md: -------------------------------------------------------------------------------- 1 | ## Documentation 2 | We are using Sphinx with Google-style docstrings to build our documentation. It's a fairly straightforward process to build the docs locally to preview your changes. There is a script for deployment to gh-pages, described below. 
3 | 4 | ### Setup 5 | 6 | We obviously require sphinx for this, but [sphinx_bootstrap](https://github.com/ryan-roemer/sphinx-bootstrap-theme) is required for building the docs in Bootstrap. 7 | 8 | on osx: 9 | 10 | ```.bash 11 | pip install sphinx 12 | pip install recommonmark 13 | pip install sphinx-bootstrap-theme 14 | pip install sphinxcontrib-napoleon 15 | ``` 16 | 17 | ### Build 18 | 19 | This will build the docs locally for testing and future deployment. 20 | 21 | ```.bash 22 | cd tweet_parser/docs 23 | make clean 24 | make html 25 | ``` 26 | 27 | ### Deploying to github pages 28 | From the root of the repo run: 29 | 30 | ```.bash 31 | bash doc_build.sh <branch_name> 32 | ``` 33 | 34 | where `<branch_name>` is the name of the branch you'll be building from, most likely master. The script will change to the `gh-pages` branch, clean out the old docs, pull your changes from the relevant branch, build them, and give you instructions for review and commands for deployment. 35 | -------------------------------------------------------------------------------- /docs/source/README.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../../README.rst 2 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | # 4 | # Tweet Parser documentation build configuration file, created by 5 | # sphinx-quickstart on Wed Aug 9 13:44:53 2017. 6 | # 7 | # This file is execfile()d with the current directory set to its 8 | # containing dir. 9 | # 10 | # Note that not all possible configuration values are present in this 11 | # autogenerated file. 12 | # 13 | # All configuration values have a default; values that are commented out 14 | # serve to show the default. 
15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | # 20 | import os 21 | import sys 22 | 23 | sys.path.insert(0, os.path.abspath('../')) 24 | sys.path.insert(0, os.path.abspath('.')) 25 | 26 | 27 | import sphinx_bootstrap_theme 28 | 29 | # -- General configuration ------------------------------------------------ 30 | 31 | # If your documentation needs a minimal Sphinx version, state it here. 32 | # 33 | # needs_sphinx = '1.0' 34 | 35 | # Add any Sphinx extension module names here, as strings. They can be 36 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 37 | # ones. 38 | extensions = ['sphinx.ext.autodoc', 39 | 'sphinx.ext.doctest', 40 | 'sphinx.ext.intersphinx', 41 | 'sphinx.ext.todo', 42 | 'sphinx.ext.coverage', 43 | 'sphinx.ext.viewcode', 44 | 'sphinxcontrib.napoleon', 45 | 'sphinx.ext.githubpages'] 46 | 47 | 48 | # Add any paths that contain templates here, relative to this directory. 49 | templates_path = ['_templates'] 50 | 51 | # The suffix(es) of source filenames. 52 | # You can specify multiple suffix as a list of string: 53 | 54 | source_parsers = { 55 | '.md': 'recommonmark.parser.CommonMarkParser', 56 | } 57 | source_suffix = ['.rst', '.md'] 58 | 59 | 60 | # The master toctree document. 61 | master_doc = 'index' 62 | 63 | # General information about the project. 64 | project = 'Tweet Parser' 65 | copyright = '2017, Twitter DDIS' 66 | author = 'Twitter DDIS' 67 | 68 | # The version info for the project you're documenting, acts as replacement for 69 | # |version| and |release|, also used in various other places throughout the 70 | # built documents. 71 | # 72 | # The short X.Y version. 73 | version = '1' 74 | # The full version, including alpha/beta/rc tags. 
75 | release = '1' 76 | 77 | # The language for content autogenerated by Sphinx. Refer to documentation 78 | # for a list of supported languages. 79 | # 80 | # This is also used if you do content translation via gettext catalogs. 81 | # Usually you set "language" from the command line for these cases. 82 | language = None 83 | 84 | # List of patterns, relative to source directory, that match files and 85 | # directories to ignore when looking for source files. 86 | # This patterns also effect to html_static_path and html_extra_path 87 | exclude_patterns = [] 88 | 89 | # The name of the Pygments (syntax highlighting) style to use. 90 | pygments_style = 'sphinx' 91 | 92 | # If true, `todo` and `todoList` produce output, else they produce nothing. 93 | todo_include_todos = False 94 | 95 | 96 | # -- Options for HTML output ---------------------------------------------- 97 | 98 | # The theme to use for HTML and HTML Help pages. See the documentation for 99 | # a list of builtin themes. 100 | # 101 | # Theme options are theme-specific and customize the look and feel of a theme 102 | # further. For a list of options available for each theme, see the 103 | # documentation. 104 | # 105 | # html_theme_options = {} 106 | 107 | # Add any paths that contain custom static files (such as style sheets) here, 108 | # relative to this directory. They are copied after the builtin static files, 109 | # so a file named "default.css" will overwrite the builtin "default.css". 110 | html_static_path = [] 111 | 112 | 113 | # html_theme = 'alabaster' 114 | # html_theme_path = sphinx_bootstrap_theme.get_html_theme_path() 115 | html_theme = 'bootstrap' 116 | html_theme_path = sphinx_bootstrap_theme.get_html_theme_path() 117 | 118 | html_theme_options = { 119 | # Navigation bar title. (Default: ``project`` value) 120 | 'navbar_title': "Tweet Parser", 121 | 122 | # Tab name for entire site. 
(Default: "Site") 123 | 'navbar_site_name': "Tweet Parser", 124 | 125 | # A list of tuples containing pages or urls to link to. 126 | # Valid tuples should be in the following forms: 127 | # (name, page) # a link to a page 128 | # (name, "/aa/bb", 1) # a link to an arbitrary relative url 129 | # (name, "http://example.com", True) # arbitrary absolute url 130 | # Note the "1" or "True" value above as the third argument to indicate 131 | # an arbitrary url. 132 | 'navbar_links': [ 133 | ("Github", "https://github.com/twitterdev/tweet_parser", True), 134 | ], 135 | 136 | # Render the next and previous page links in navbar. (Default: true) 137 | 'navbar_sidebarrel': True, 138 | 139 | # Render the current pages TOC in the navbar. (Default: true) 140 | 'navbar_pagenav': True, 141 | 142 | # Tab name for the current pages TOC. (Default: "Page") 143 | 'navbar_pagenav_name': "Page", 144 | 145 | # Global TOC depth for "site" navbar tab. (Default: 1) 146 | # Switching to -1 shows all levels. 147 | 'globaltoc_depth': 2, 148 | 149 | # Include hidden TOCs in Site navbar? 150 | # 151 | # Note: If this is "false", you cannot have mixed ``:hidden:`` and 152 | # non-hidden ``toctree`` directives in the same page, or else the build 153 | # will break. 154 | # 155 | # Values: "true" (default) or "false" 156 | 'globaltoc_includehidden': "true", 157 | 158 | # HTML navbar class (Default: "navbar") to attach to
element. 159 | # For black navbar, do "navbar navbar-inverse" 160 | 'navbar_class': "navbar navbar-inverse", 161 | 162 | # Fix navigation bar to top of page? 163 | # Values: "true" (default) or "false" 164 | 'navbar_fixed_top': "true", 165 | 166 | # Location of link to source. 167 | # Options are "nav" (default), "footer" or anything else to exclude. 168 | 'source_link_position': None, 169 | 170 | # Bootswatch (http://bootswatch.com/) theme. 171 | # 172 | # Options are nothing (default) or the name of a valid theme 173 | # such as "cosmo" or "sandstone". 174 | 'bootswatch_theme': "cosmo", 175 | 176 | # Choose Bootstrap version. 177 | # Values: "3" (default) or "2" (in quotes) 178 | 'bootstrap_version': "3", 179 | } 180 | 181 | # -- Options for HTMLHelp output ------------------------------------------ 182 | 183 | # Output file base name for HTML help builder. 184 | htmlhelp_basename = 'TweetParserdoc' 185 | 186 | 187 | # -- Options for LaTeX output --------------------------------------------- 188 | 189 | latex_elements = { 190 | # The paper size ('letterpaper' or 'a4paper'). 191 | # 192 | # 'papersize': 'letterpaper', 193 | 194 | # The font size ('10pt', '11pt' or '12pt'). 195 | # 196 | # 'pointsize': '11pt', 197 | 198 | # Additional stuff for the LaTeX preamble. 199 | # 200 | # 'preamble': '', 201 | 202 | # Latex figure (float) alignment 203 | # 204 | # 'figure_align': 'htbp', 205 | } 206 | 207 | # Grouping the document tree into LaTeX files. List of tuples 208 | # (source start file, target name, title, 209 | # author, documentclass [howto, manual, or own class]). 210 | latex_documents = [ 211 | (master_doc, 'TweetParser.tex', 'Tweet Parser Documentation', 212 | 'Twitter DDIS', 'manual'), 213 | ] 214 | 215 | 216 | # -- Options for manual page output --------------------------------------- 217 | 218 | # One entry per manual page. List of tuples 219 | # (source start file, name, description, authors, manual section). 
220 | man_pages = [ 221 | (master_doc, 'tweetparser', 'Tweet Parser Documentation', 222 | [author], 1) 223 | ] 224 | 225 | 226 | # -- Options for Texinfo output ------------------------------------------- 227 | 228 | # Grouping the document tree into Texinfo files. List of tuples 229 | # (source start file, target name, title, author, 230 | # dir menu entry, description, category) 231 | texinfo_documents = [ 232 | (master_doc, 'TweetParser', 'Tweet Parser Documentation', 233 | author, 'TweetParser', 'One line description of project.', 234 | 'Miscellaneous'), 235 | ] 236 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. Tweet Parser documentation master file, created by 2 | sphinx-quickstart on Wed Aug 9 13:44:53 2017. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | .. include:: README.rst 6 | 7 | 8 | .. toctree:: 9 | :maxdepth: 3 10 | :caption: Contents: 11 | 12 | self 13 | tweet_parser 14 | 15 | 16 | 17 | Indices and tables 18 | ================== 19 | 20 | * :ref:`genindex` 21 | * :ref:`modindex` 22 | * :ref:`search` 23 | 24 | 25 | -------------------------------------------------------------------------------- /docs/source/tweet_parser.getter_methods.rst: -------------------------------------------------------------------------------- 1 | tweet\_parser\.getter\_methods package 2 | ====================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | tweet\_parser\.getter\_methods\.gnip\_fields module 8 | --------------------------------------------------- 9 | 10 | .. automodule:: tweet_parser.getter_methods.gnip_fields 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | tweet\_parser\.getter\_methods\.tweet\_counts module 16 | ---------------------------------------------------- 17 | 18 | .. 
automodule:: tweet_parser.getter_methods.tweet_counts 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | tweet\_parser\.getter\_methods\.tweet\_date module 24 | -------------------------------------------------- 25 | 26 | .. automodule:: tweet_parser.getter_methods.tweet_date 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | tweet\_parser\.getter\_methods\.tweet\_embeds module 32 | ---------------------------------------------------- 33 | 34 | .. automodule:: tweet_parser.getter_methods.tweet_embeds 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | tweet\_parser\.getter\_methods\.tweet\_entities module 40 | ------------------------------------------------------ 41 | 42 | .. automodule:: tweet_parser.getter_methods.tweet_entities 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | tweet\_parser\.getter\_methods\.tweet\_generator module 48 | ------------------------------------------------------- 49 | 50 | .. automodule:: tweet_parser.getter_methods.tweet_generator 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | tweet\_parser\.getter\_methods\.tweet\_geo module 56 | ------------------------------------------------- 57 | 58 | .. automodule:: tweet_parser.getter_methods.tweet_geo 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | tweet\_parser\.getter\_methods\.tweet\_links module 64 | --------------------------------------------------- 65 | 66 | .. automodule:: tweet_parser.getter_methods.tweet_links 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | 71 | tweet\_parser\.getter\_methods\.tweet\_reply module 72 | --------------------------------------------------- 73 | 74 | .. automodule:: tweet_parser.getter_methods.tweet_reply 75 | :members: 76 | :undoc-members: 77 | :show-inheritance: 78 | 79 | tweet\_parser\.getter\_methods\.tweet\_text module 80 | -------------------------------------------------- 81 | 82 | .. 
automodule:: tweet_parser.getter_methods.tweet_text 83 | :members: 84 | :undoc-members: 85 | :show-inheritance: 86 | 87 | tweet\_parser\.getter\_methods\.tweet\_user module 88 | -------------------------------------------------- 89 | 90 | .. automodule:: tweet_parser.getter_methods.tweet_user 91 | :members: 92 | :undoc-members: 93 | :show-inheritance: 94 | 95 | 96 | Module contents 97 | --------------- 98 | 99 | .. automodule:: tweet_parser.getter_methods 100 | :members: 101 | :undoc-members: 102 | :show-inheritance: 103 | -------------------------------------------------------------------------------- /docs/source/tweet_parser.rst: -------------------------------------------------------------------------------- 1 | tweet\_parser package 2 | ===================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | tweet_parser.getter_methods 10 | 11 | Submodules 12 | ---------- 13 | 14 | tweet\_parser\.lazy\_property module 15 | ------------------------------------ 16 | 17 | .. automodule:: tweet_parser.lazy_property 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | tweet\_parser\.tweet module 23 | --------------------------- 24 | 25 | .. automodule:: tweet_parser.tweet 26 | :members: 27 | :undoc-members: 28 | :show-inheritance: 29 | 30 | tweet\_parser\.tweet\_checking module 31 | ------------------------------------- 32 | 33 | .. automodule:: tweet_parser.tweet_checking 34 | :members: 35 | :undoc-members: 36 | :show-inheritance: 37 | 38 | tweet\_parser\.tweet\_keys module 39 | --------------------------------- 40 | 41 | .. automodule:: tweet_parser.tweet_keys 42 | :members: 43 | :undoc-members: 44 | :show-inheritance: 45 | 46 | tweet\_parser\.tweet\_parser\_errors module 47 | ------------------------------------------- 48 | 49 | .. automodule:: tweet_parser.tweet_parser_errors 50 | :members: 51 | :undoc-members: 52 | :show-inheritance: 53 | 54 | 55 | Module contents 56 | --------------- 57 | 58 | .. 
automodule:: tweet_parser 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2018 Twitter, Inc. 3 | # Licensed under the MIT License 4 | # https://opensource.org/licenses/MIT 5 | from setuptools import setup, find_packages 6 | 7 | setup(name='tweet_parser', 8 | description="Tools for Tweet parsing", 9 | url='https://github.com/twitterdev/tweet_parser', 10 | author='Fiona Pigott, Jeff Kolb, Josh Montague, Aaron Gonzales', 11 | long_description=open('README.rst', 'r').read(), 12 | author_email='fpigott@twitter.com', 13 | license='MIT', 14 | version='1.13.2', 15 | packages=find_packages(), 16 | scripts=["tools/parse_tweets.py"], 17 | install_requires=[], 18 | ) 19 | -------------------------------------------------------------------------------- /test/test_tweet_parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2018 Twitter, Inc. 
def make_a_string(data):
    """Return a deterministic string form of ``data``.

    Args:
        data: any object. A ``str`` is returned unchanged; a ``set`` is
            rendered as ``"{a, b, c}"`` with its members sorted, so the
            result does not depend on set iteration order; anything else
            falls back to ``repr``.

    Returns:
        str: the string form of ``data``.
    """
    if isinstance(data, str):
        return data
    if isinstance(data, set):
        # members are joined as-is, so they must already be strings
        return "{" + ", ".join(sorted(data)) + "}"
    return repr(data)


class TestTweetMethods(unittest.TestCase):
    """Tests for the ``Tweet`` object and the payload-validation helpers.

    The fixture paths below are relative, so the tests have to be run from
    inside the ``test/`` directory.
    """

    EXAMPLES_DIR = "tweet_payload_examples"
    BROKEN_DIR = EXAMPLES_DIR + "/broken_and_unsupported_payloads"
    # these attributes are still read for both formats, but their values are
    # not compared (they will raise an error in activity streams)
    FORMAT_DEPENDENT_ATTRS = ("poll_options", "in_reply_to_user_id", "quote_count")

    @staticmethod
    def _load_tweets(path):
        """Parse a file holding one JSON payload per line into ``{tweet.id: Tweet}``."""
        tweets = {}
        # ``with`` guarantees the file is closed even if a payload fails to parse
        with open(path, "r") as payload_file:
            for line in payload_file:
                tweet = Tweet(json.loads(line))
                tweets[tweet.id] = tweet
        return tweets

    def _load_broken_payload(self, filename):
        """Return the decoded JSON of one file from the broken-payload directory."""
        with open(self.BROKEN_DIR + "/" + filename, "r") as payload_file:
            return json.load(payload_file)

    @staticmethod
    def _comparable_value(tweet, attr):
        """Read ``tweet.<attr>`` in a form that can be compared across formats.

        A nested ``Tweet`` is reduced to its id, and a ``NotAvailableError``
        is replaced by its ``repr`` so that two payloads that fail in the
        same way still compare equal.
        """
        try:
            value = getattr(tweet, attr)
            if isinstance(value, Tweet):
                value = value.id
        except NotAvailableError as error:
            value = repr(error)
        return value

    def setUp(self):
        """Load the example payloads of both formats, keyed by Tweet id."""
        self.tweet_payloads = {
            "activity_streams": self._load_tweets(
                self.EXAMPLES_DIR + "/activity_streams_examples.json"),
            "original_format": self._load_tweets(
                self.EXAMPLES_DIR + "/original_format_examples.json"),
        }
        all_ids = set()
        for tweets in self.tweet_payloads.values():
            all_ids.update(tweets)
        self.tweet_ids = list(all_ids)

    def test_equivalent_formats(self):
        """Every public ``Tweet`` attribute must agree between the two formats."""
        public_attrs = sorted(
            name for name in set(dir(Tweet)) - set(dir(dict))
            if not name.startswith("_"))
        originals = self.tweet_payloads["original_format"]
        activities = self.tweet_payloads["activity_streams"]
        for tweet_id in self.tweet_ids:
            # a tweet present in only one fixture file is a fixture error;
            # report it as such instead of as a bare KeyError
            self.assertIn(tweet_id, originals,
                          "tweet {} has no original-format example".format(tweet_id))
            self.assertIn(tweet_id, activities,
                          "tweet {} has no activity-streams example".format(tweet_id))
            original = originals[tweet_id]
            activity = activities[tweet_id]
            # we know that we can't get polls in activity streams
            if original.poll_options != []:
                continue
            for attr in public_attrs:
                orig = self._comparable_value(original, attr)
                acti = self._comparable_value(activity, attr)
                # for some reason the ["body"]/["text"] truncations are
                # different in as vs og
                if attr == "text":
                    orig = orig[0:100]
                    acti = acti[0:100]
                if attr not in self.FORMAT_DEPENDENT_ATTRS:
                    self.assertEqual(
                        orig, acti,
                        "attribute {!r} differs for tweet {}".format(attr, tweet_id))

    def test_bad_payloads(self):
        """Broken or unexpected payloads must raise the documented errors.

        Each payload is loaded before entering ``assertRaises`` so that only
        the ``Tweet`` constructor is exercised inside the context.
        """
        # missing the user field, raises a "NotATweetError"
        payload = self._load_broken_payload("original_format_missing_user.json")
        with self.assertRaises(NotATweetError):
            Tweet(payload)
        # missing a different required field, raises "UnexpectedFormatError"
        payload = self._load_broken_payload("original_format_missing_field.json")
        with self.assertRaises(UnexpectedFormatError):
            Tweet(payload, do_format_validation=True)
        # missing a different required field, raises "UnexpectedFormatError"
        payload = self._load_broken_payload("activity_streams_missing_field.json")
        with self.assertRaises(UnexpectedFormatError):
            Tweet(payload, do_format_validation=True)
        # added a new field, raises "UnexpectedFormatError"
        payload = self._load_broken_payload("activity_streams_additional_field.json")
        with self.assertRaises(UnexpectedFormatError):
            Tweet(payload, do_format_validation=True)
        # added a new field, raises "UnexpectedFormatError"
        payload = self._load_broken_payload("original_format_additional_field.json")
        with self.assertRaises(UnexpectedFormatError):
            Tweet(payload, do_format_validation=True)
        # note: these tests aren't going to cover some kinds of malformed
        # payloads (i.e., "quote tweet" section is missing fields)

    def test_check_format(self):
        """``key_validation_check`` rejects extra or missing keys and returns 0 otherwise."""
        superset = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
        minset = {2, 4, 6, 8, 10}
        too_many_keys = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}
        too_few_keys = {2, 4, 6, 8}
        just_right = {1, 2, 4, 6, 8, 10}
        with self.assertRaises(UnexpectedFormatError):
            tweet_checking.key_validation_check(too_many_keys, superset, minset)
        with self.assertRaises(UnexpectedFormatError):
            tweet_checking.key_validation_check(too_few_keys, superset, minset)
        self.assertEqual(0, tweet_checking.key_validation_check(just_right, superset, minset))

    def test_get_all_keys(self):
        """``get_all_keys`` flattens a nested dict into space-joined key paths."""
        # define a test nested dict:
        test_dict = {"a": {"b": "c", "d": {"e": "f", "g": "h"}}, "i": "j"}
        self.assertEqual(set(tweet_checking.get_all_keys(test_dict)),
                         {"a b", "a d e", "a d g", "i"})


if __name__ == '__main__':
    # NOTE: FieldDeprecationWarning is deliberately not filtered here, so
    # deprecation warnings are shown while the tests run.
    unittest.main()
Client<\/a>","geo":null,"truncated":false,"is_quote_status":false,"in_reply_to_status_id_str":null,"text":"I) I almost forgot to include a poll Tweet!","favorite_count":0,"quote_count":0,"in_reply_to_user_id":null,"in_reply_to_status_id":null,"filter_level":"low","entities":{"urls":[],"user_mentions":[],"hashtags":[],"polls":[{"duration_minutes":1440,"end_datetime":"Thu May 25 22:13:40 +0000 2017","options":[{"position":1,"text":"\ud83d\ude04"},{"position":2,"text":"\ud83e\udd10"},{"position":3,"text":"\ud83d\ude31"}]}],"symbols":[]},"in_reply_to_user_id_str":null,"id_str":"867503895978754048","reply_count":0,"user":{"listed_count":7,"lang":"en","id":815279070241955840,"protected":false,"verified":false,"follow_request_sent":null,"default_profile_image":false,"url":null,"id_str":"815279070241955840","friends_count":0,"profile_text_color":"000000","profile_sidebar_fill_color":"000000","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","name":"jk no","notifications":null,"profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","location":null,"profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/815279765259100161\/RID37OuE_normal.jpg","geo_enabled":true,"time_zone":null,"is_translator":false,"statuses_count":64,"translator_type":"none","profile_background_tile":false,"derived":{"klout":{"score":17,"interest_topics":[{"name":"Twitter","score":0.86,"id":"10000000000000008253","url":"http:\/\/klout.com\/topic\/id\/10000000000000008253"},{"name":"Media","score":0.71,"id":"7783102141237674703","url":"http:\/\/klout.com\/topic\/id\/7783102141237674703"},{"name":"Emoji","score":0.5,"id":"10000000000000019376","url":"http:\/\/klout.com\/topic\/id\/10000000000000019376"},{"name":"C++","score":0.44,"id":"9219221220892057324","url":"http:\/\/klout.com\/topic\/id\/9219221220892057324"},{"name":"Boulder","score":0.43,"id":"7003086526134829815","url":"http:\/\/klout.com\/topic\/id\/70030865261
34829815"}],"user_id":"133700650730839424","influence_topics":[{"name":"Technology","score":0.53,"id":"10000000000000016635","url":"http:\/\/klout.com\/topic\/id\/10000000000000016635"},{"name":"Latin","score":0.47,"id":"5227535270209280137","url":"http:\/\/klout.com\/topic\/id\/5227535270209280137"},{"name":"Beer","score":0.47,"id":"7225108339966145103","url":"http:\/\/klout.com\/topic\/id\/7225108339966145103"},{"name":"Business","score":0.44,"id":"10000000000000016634","url":"http:\/\/klout.com\/topic\/id\/10000000000000016634"},{"name":"Food and Drink","score":0.43,"id":"10000000000000000001","url":"http:\/\/klout.com\/topic\/id\/10000000000000000001"}],"profile_url":"http:\/\/klout.com\/user\/id\/133700650730839424"}},"contributors_enabled":false,"profile_use_background_image":false,"utc_offset":null,"screen_name":"RobotPrincessFi","profile_background_color":"000000","followers_count":2,"profile_sidebar_border_color":"000000","default_profile":false,"profile_link_color":"F58EA8","following":null,"created_at":"Sat Dec 31 19:30:52 +0000 2016","profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/815279765259100161\/RID37OuE_normal.jpg","description":"Niche millennial content aggregator | Just messing with Twitter bots | github: https:\/\/github.com\/fionapigott\/fiona-bot","favourites_count":0},"coordinates":null,"retweeted":false,"retweet_count":0,"contributors":null,"created_at":"Wed May 24 22:13:40 +0000 2017","in_reply_to_screen_name":null,"favorited":false,"matching_rules":[{"tag":null}]} 2 | -------------------------------------------------------------------------------- /test/tweet_payload_examples/broken_and_unsupported_payloads/activity_streams_missing_field.json: -------------------------------------------------------------------------------- 1 | {"twitter_lang":"en","actor":{"links":[{"rel":"me","href":null}],"favoritesCount":0,"displayName":"jk 
no","languages":["en"],"statusesCount":64,"friendsCount":0,"objectType":"person","image":"https:\/\/pbs.twimg.com\/profile_images\/815279765259100161\/RID37OuE_normal.jpg","twitterTimeZone":null,"preferredUsername":"RobotPrincessFi","verified":false,"followersCount":2,"utcOffset":null,"id":"id:twitter.com:815279070241955840","link":"http:\/\/www.twitter.com\/RobotPrincessFi","postedTime":"2016-12-31T19:30:52.362Z","listedCount":7},"location":{"displayName":"Las Condes, Chile","twitter_country_code":"CL","link":"https:\/\/api.twitter.com\/1.1\/geo\/id\/00c4b64e7affea25.json","name":"Las Condes","objectType":"place","twitter_place_type":"city","country_code":"Chile","geo":{"type":"Polygon","coordinates":[[[-70.608694,-33.486182],[-70.608694,-33.364619],[-70.429141,-33.364619],[-70.429141,-33.486182]]]}},"body":"K) Let's try just one photo! \ud83d\udc30 https:\/\/t.co\/ubynnad49V","retweetCount":0,"id":"tag:search.twitter.com,2005:867834809732677634","objectType":"activity","postedTime":"2017-05-25T20:08:36.000Z","verb":"post","object":{"summary":"K) Let's try just one photo! 
\ud83d\udc30 https:\/\/t.co\/ubynnad49V","id":"object:search.twitter.com,2005:867834809732677634","objectType":"note","postedTime":"2017-05-25T20:08:36.000Z","link":"http:\/\/twitter.com\/RobotPrincessFi\/statuses\/867834809732677634"},"display_text_range":[0,30],"provider":{"link":"http:\/\/www.twitter.com","objectType":"service","displayName":"Twitter"},"link":"http:\/\/twitter.com\/RobotPrincessFi\/statuses\/867834809732677634","favoritesCount":0,"twitter_filter_level":"low","twitter_entities":{"hashtags":[],"urls":[],"user_mentions":[],"media":[{"media_url_https":"https:\/\/pbs.twimg.com\/media\/DAsrG_tWsAEzeEy.jpg","media_url":"http:\/\/pbs.twimg.com\/media\/DAsrG_tWsAEzeEy.jpg","expanded_url":"https:\/\/twitter.com\/RobotPrincessFi\/status\/867834809732677634\/photo\/1","type":"photo","indices":[31,54],"url":"https:\/\/t.co\/ubynnad49V","id":867834752379826177,"id_str":"867834752379826177","display_url":"pic.twitter.com\/ubynnad49V","sizes":{"large":{"w":355,"resize":"fit","h":236},"thumb":{"w":150,"resize":"crop","h":150},"medium":{"w":355,"resize":"fit","h":236},"small":{"w":355,"resize":"fit","h":236}}}],"symbols":[]},"gnip":{"matching_rules":[{"tag":null}],"klout_score":17,"klout_profile":{"link":"http:\/\/klout.com\/user\/id\/133700650730839424","topics":[{"link":"http:\/\/klout.com\/topic\/id\/10000000000000016635","id":"10000000000000016635","displayName":"Technology","score":0.53,"topic_type":"influence"},{"link":"http:\/\/klout.com\/topic\/id\/5227535270209280137","id":"5227535270209280137","displayName":"Latin","score":0.47,"topic_type":"influence"},{"link":"http:\/\/klout.com\/topic\/id\/7225108339966145103","id":"7225108339966145103","displayName":"Beer","score":0.47,"topic_type":"influence"},{"link":"http:\/\/klout.com\/topic\/id\/10000000000000016634","id":"10000000000000016634","displayName":"Business","score":0.44,"topic_type":"influence"},{"link":"http:\/\/klout.com\/topic\/id\/10000000000000000001","id":"10000000000000000001","displayName":"F
ood and Drink","score":0.43,"topic_type":"influence"},{"link":"http:\/\/klout.com\/topic\/id\/10000000000000008253","id":"10000000000000008253","displayName":"Twitter","score":0.86,"topic_type":"interest"},{"link":"http:\/\/klout.com\/topic\/id\/7783102141237674703","id":"7783102141237674703","displayName":"Media","score":0.71,"topic_type":"interest"},{"link":"http:\/\/klout.com\/topic\/id\/10000000000000019376","id":"10000000000000019376","displayName":"Emoji","score":0.5,"topic_type":"interest"},{"link":"http:\/\/klout.com\/topic\/id\/9219221220892057324","id":"9219221220892057324","displayName":"C++","score":0.44,"topic_type":"interest"},{"link":"http:\/\/klout.com\/topic\/id\/7003086526134829815","id":"7003086526134829815","displayName":"Boulder","score":0.43,"topic_type":"interest"}],"klout_user_id":"133700650730839424"}},"twitter_extended_entities":{"media":[{"media_url_https":"https:\/\/pbs.twimg.com\/media\/DAsrG_tWsAEzeEy.jpg","media_url":"http:\/\/pbs.twimg.com\/media\/DAsrG_tWsAEzeEy.jpg","expanded_url":"https:\/\/twitter.com\/RobotPrincessFi\/status\/867834809732677634\/photo\/1","type":"photo","indices":[31,54],"url":"https:\/\/t.co\/ubynnad49V","id":867834752379826177,"id_str":"867834752379826177","display_url":"pic.twitter.com\/ubynnad49V","sizes":{"large":{"w":355,"resize":"fit","h":236},"thumb":{"w":150,"resize":"crop","h":150},"medium":{"w":355,"resize":"fit","h":236},"small":{"w":355,"resize":"fit","h":236}}}]},"generator":{"link":"http:\/\/twitter.com","displayName":"Twitter Web Client"}} 2 | -------------------------------------------------------------------------------- /test/tweet_payload_examples/broken_and_unsupported_payloads/original_format_additional_field.json: -------------------------------------------------------------------------------- 1 | 
{"unexpected_field":"blahhhhhh","display_text_range":[0,30],"lang":"en","id":867834809732677634,"place":{"bounding_box":{"coordinates":[[[-70.608694,-33.486182],[-70.608694,-33.364619],[-70.429141,-33.364619],[-70.429141,-33.486182]]],"type":"Polygon"},"id":"00c4b64e7affea25","attributes":{},"full_name":"Las Condes, Chile","name":"Las Condes","place_type":"city","country":"Chile","url":"https:\/\/api.twitter.com\/1.1\/geo\/id\/00c4b64e7affea25.json","country_code":"CL"},"source":"Twitter Web Client<\/a>","geo":null,"possibly_sensitive":false,"truncated":false,"is_quote_status":false,"in_reply_to_status_id_str":null,"text":"K) Let's try just one photo! \ud83d\udc30 https:\/\/t.co\/ubynnad49V","favorite_count":0,"quote_count":0,"in_reply_to_user_id":null,"in_reply_to_status_id":null,"extended_entities":{"media":[{"indices":[31,54],"sizes":{"thumb":{"resize":"crop","h":150,"w":150},"medium":{"resize":"fit","h":236,"w":355},"small":{"resize":"fit","h":236,"w":355},"large":{"resize":"fit","h":236,"w":355}},"id":867834752379826177,"media_url":"http:\/\/pbs.twimg.com\/media\/DAsrG_tWsAEzeEy.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/DAsrG_tWsAEzeEy.jpg","url":"https:\/\/t.co\/ubynnad49V","type":"photo","display_url":"pic.twitter.com\/ubynnad49V","id_str":"867834752379826177","expanded_url":"https:\/\/twitter.com\/RobotPrincessFi\/status\/867834809732677634\/photo\/1"}]},"filter_level":"low","entities":{"urls":[],"user_mentions":[],"media":[{"indices":[31,54],"sizes":{"thumb":{"resize":"crop","h":150,"w":150},"medium":{"resize":"fit","h":236,"w":355},"small":{"resize":"fit","h":236,"w":355},"large":{"resize":"fit","h":236,"w":355}},"id":867834752379826177,"media_url":"http:\/\/pbs.twimg.com\/media\/DAsrG_tWsAEzeEy.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/DAsrG_tWsAEzeEy.jpg","url":"https:\/\/t.co\/ubynnad49V","type":"photo","display_url":"pic.twitter.com\/ubynnad49V","id_str":"867834752379826177","expanded_url":"https:\/\/twitter.com\/RobotPrince
ssFi\/status\/867834809732677634\/photo\/1"}],"hashtags":[],"symbols":[]},"in_reply_to_user_id_str":null,"id_str":"867834809732677634","reply_count":0,"user":{"listed_count":7,"lang":"en","id":815279070241955840,"protected":false,"verified":false,"follow_request_sent":null,"default_profile_image":false,"url":null,"id_str":"815279070241955840","friends_count":0,"profile_text_color":"000000","profile_sidebar_fill_color":"000000","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","name":"jk no","notifications":null,"profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","location":null,"profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/815279765259100161\/RID37OuE_normal.jpg","geo_enabled":true,"time_zone":null,"is_translator":false,"statuses_count":64,"translator_type":"none","profile_background_tile":false,"derived":{"klout":{"score":17,"interest_topics":[{"name":"Twitter","score":0.86,"id":"10000000000000008253","url":"http:\/\/klout.com\/topic\/id\/10000000000000008253"},{"name":"Media","score":0.71,"id":"7783102141237674703","url":"http:\/\/klout.com\/topic\/id\/7783102141237674703"},{"name":"Emoji","score":0.5,"id":"10000000000000019376","url":"http:\/\/klout.com\/topic\/id\/10000000000000019376"},{"name":"C++","score":0.44,"id":"9219221220892057324","url":"http:\/\/klout.com\/topic\/id\/9219221220892057324"},{"name":"Boulder","score":0.43,"id":"7003086526134829815","url":"http:\/\/klout.com\/topic\/id\/7003086526134829815"}],"user_id":"133700650730839424","influence_topics":[{"name":"Technology","score":0.53,"id":"10000000000000016635","url":"http:\/\/klout.com\/topic\/id\/10000000000000016635"},{"name":"Latin","score":0.47,"id":"5227535270209280137","url":"http:\/\/klout.com\/topic\/id\/5227535270209280137"},{"name":"Beer","score":0.47,"id":"7225108339966145103","url":"http:\/\/klout.com\/topic\/id\/7225108339966145103"},{"name":"Business","score":0.44,"id":"10000000000
000016634","url":"http:\/\/klout.com\/topic\/id\/10000000000000016634"},{"name":"Food and Drink","score":0.43,"id":"10000000000000000001","url":"http:\/\/klout.com\/topic\/id\/10000000000000000001"}],"profile_url":"http:\/\/klout.com\/user\/id\/133700650730839424"}},"contributors_enabled":false,"profile_use_background_image":false,"utc_offset":null,"screen_name":"RobotPrincessFi","profile_background_color":"000000","followers_count":2,"profile_sidebar_border_color":"000000","default_profile":false,"profile_link_color":"F58EA8","following":null,"created_at":"Sat Dec 31 19:30:52 +0000 2016","profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/815279765259100161\/RID37OuE_normal.jpg","description":"Niche millennial content aggregator | Just messing with Twitter bots | github: https:\/\/github.com\/fionapigott\/fiona-bot","favourites_count":0},"coordinates":null,"retweeted":false,"retweet_count":0,"contributors":null,"created_at":"Thu May 25 20:08:36 +0000 2017","in_reply_to_screen_name":null,"favorited":false,"matching_rules":[{"tag":null}]} 2 | -------------------------------------------------------------------------------- /test/tweet_payload_examples/broken_and_unsupported_payloads/original_format_missing_field.json: -------------------------------------------------------------------------------- 1 | {"lang":"en","id":867468138991964200,"place":{"bounding_box":{"coordinates":[[[-70.608694,-33.486182],[-70.608694,-33.364619],[-70.429141,-33.364619],[-70.429141,-33.486182]]],"type":"Polygon"},"id":"00c4b64e7affea25","attributes":{},"full_name":"Las Condes, Chile","name":"Las Condes","place_type":"city","country":"Chile","url":"https://api.twitter.com/1.1/geo/id/00c4b64e7affea25.json","country_code":"CL"},"source":"Twitter Web Client","geo":null,"truncated":false,"is_quote_status":false,"in_reply_to_status_id_str":null,"text":"A) This is a regular old Tweet. 
🐣","favorite_count":0,"quote_count":0,"in_reply_to_user_id":null,"in_reply_to_status_id":null,"filter_level":"low","entities":{"urls":[],"user_mentions":[],"hashtags":[],"symbols":[]},"in_reply_to_user_id_str":null,"id_str":"867468138991964160","reply_count":0,"user":{"listed_count":7,"lang":"en","id":815279070241955800,"protected":false,"verified":false,"follow_request_sent":null,"default_profile_image":false,"url":null,"id_str":"815279070241955840","friends_count":0,"profile_text_color":"000000","profile_sidebar_fill_color":"000000","profile_background_image_url":"http://abs.twimg.com/images/themes/theme1/bg.png","name":"jk no","notifications":null,"profile_background_image_url_https":"https://abs.twimg.com/images/themes/theme1/bg.png","location":null,"profile_image_url_https":"https://pbs.twimg.com/profile_images/815279765259100161/RID37OuE_normal.jpg","geo_enabled":true,"time_zone":null,"is_translator":false,"statuses_count":64,"translator_type":"none","profile_background_tile":false,"derived":{"klout":{"score":17,"interest_topics":[{"name":"Twitter","score":0.86,"id":"10000000000000008253","url":"http://klout.com/topic/id/10000000000000008253"},{"name":"Media","score":0.71,"id":"7783102141237674703","url":"http://klout.com/topic/id/7783102141237674703"},{"name":"Emoji","score":0.5,"id":"10000000000000019376","url":"http://klout.com/topic/id/10000000000000019376"},{"name":"C++","score":0.44,"id":"9219221220892057324","url":"http://klout.com/topic/id/9219221220892057324"},{"name":"Boulder","score":0.43,"id":"7003086526134829815","url":"http://klout.com/topic/id/7003086526134829815"}],"user_id":"133700650730839424","influence_topics":[{"name":"Technology","score":0.53,"id":"10000000000000016635","url":"http://klout.com/topic/id/10000000000000016635"},{"name":"Latin","score":0.47,"id":"5227535270209280137","url":"http://klout.com/topic/id/5227535270209280137"},{"name":"Beer","score":0.47,"id":"7225108339966145103","url":"http://klout.com/topic/id/722510833996614510
3"},{"name":"Business","score":0.44,"id":"10000000000000016634","url":"http://klout.com/topic/id/10000000000000016634"},{"name":"Food and Drink","score":0.43,"id":"10000000000000000001","url":"http://klout.com/topic/id/10000000000000000001"}],"profile_url":"http://klout.com/user/id/133700650730839424"}},"contributors_enabled":false,"profile_use_background_image":false,"utc_offset":null,"screen_name":"RobotPrincessFi","profile_background_color":"000000","followers_count":2,"profile_sidebar_border_color":"000000","default_profile":false,"profile_link_color":"F58EA8","following":null,"created_at":"Sat Dec 31 19:30:52 +0000 2016","profile_image_url":"http://pbs.twimg.com/profile_images/815279765259100161/RID37OuE_normal.jpg","favourites_count":0},"coordinates":null,"retweeted":false,"retweet_count":0,"contributors":null,"created_at":"Wed May 24 19:51:35 +0000 2017","in_reply_to_screen_name":null,"favorited":false,"matching_rules":[{"tag":null}]} 2 | -------------------------------------------------------------------------------- /test/tweet_payload_examples/broken_and_unsupported_payloads/original_format_missing_quotetweet_field.json: -------------------------------------------------------------------------------- 1 | {"display_text_range":[0,140],"lang":"en","id":867479301360205800,"place":null,"source":"Twitter Web Client","geo":null,"possibly_sensitive":false,"truncated":true,"is_quote_status":true,"in_reply_to_status_id_str":null,"text":"Try using \"xxd\" on this 🙃:\n(Notice the \"joiner\" characters)\n👮‍♀️👩‍🚒👩‍🔧👩‍🏭👷‍♀️👨‍🚒👨‍🌾👨‍🍳👨‍🎤👩‍🎨👨‍💼👩‍🎓👨‍🏫👨‍🎨👩‍💻👩‍🔬👨‍🚀👩‍… https://t.co/QpqFmEDBLQ","favorite_count":0,"quote_count":0,"in_reply_to_user_id":null,"quoted_status_id":861652051016663000,"in_reply_to_status_id":null,"extended_tweet":{"full_text":"Try using \"xxd\" on this 🙃:\n(Notice the \"joiner\" characters)\n👮‍♀️👩‍🚒👩‍🔧👩‍🏭👷‍♀️👨‍🚒👨‍🌾👨‍🍳👨‍🎤👩‍🎨👨‍💼👩‍🎓👨‍🏫👨‍🎨👩‍💻👩‍🔬👨‍🚀👩‍⚕️👨‍✈️👨‍⚖️🕵️\n 🧙‍♂️ Magic! 
https://t.co/5PU0FLFRYz","display_text_range":[0,140],"entities":{"urls":[{"unwound":{"description":"“Also, if you're on a Unix machine, try using \"xxd\" to see what your text is stored as. Copy-paste: $ echo \"🙄\" | xxd $ echo \"🙄\" | xxd -b”","status":200,"title":"👸🏼🙄 Fiona on Twitter","url":"https://twitter.com/notFromShrek/status/861652051016663040"},"indices":[141,164],"expanded_url":"https://twitter.com/notFromShrek/status/861652051016663040","display_url":"twitter.com/notFromShrek/s…","url":"https://t.co/5PU0FLFRYz"}],"user_mentions":[],"hashtags":[],"symbols":[]}},"filter_level":"low","quoted_status":{"lang":"en","id":861652051016663000,"place":{"bounding_box":{"coordinates":[[[-105.301776,39.953552],[-105.301776,40.094411],[-105.183597,40.094411],[-105.183597,39.953552]]],"type":"Polygon"},"id":"fd70c22040963ac7","attributes":{},"full_name":"Boulder, CO","name":"Boulder","place_type":"city","country":"United States","url":"https://api.twitter.com/1.1/geo/id/fd70c22040963ac7.json","country_code":"US"},"source":"Twitter Web Client","geo":null,"truncated":false,"is_quote_status":false,"in_reply_to_status_id_str":"861651727614746624","text":"Also, if you're on a Unix machine, try using \"xxd\" to see what your text is stored as. 
Copy-paste:\n$ echo \"🙄\" | xxd\n$ echo \"🙄\" | xxd -b","favorite_count":0,"quote_count":0,"in_reply_to_user_id":2382763597,"in_reply_to_status_id":861651727614746600,"filter_level":"low","entities":{"urls":[],"user_mentions":[],"hashtags":[],"symbols":[]},"in_reply_to_user_id_str":"2382763597","id_str":"861652051016663040","reply_count":2,"coordinates":null,"retweeted":false,"retweet_count":0,"contributors":null,"created_at":"Mon May 08 18:40:32 +0000 2017","in_reply_to_screen_name":"notFromShrek","favorited":false},"in_reply_to_user_id_str":null,"id_str":"867479301360205824","reply_count":0,"user":{"listed_count":7,"lang":"en","id":815279070241955800,"protected":false,"verified":false,"follow_request_sent":null,"default_profile_image":false,"url":null,"id_str":"815279070241955840","friends_count":0,"profile_text_color":"000000","profile_sidebar_fill_color":"000000","profile_background_image_url":"http://abs.twimg.com/images/themes/theme1/bg.png","name":"jk no","notifications":null,"profile_background_image_url_https":"https://abs.twimg.com/images/themes/theme1/bg.png","location":null,"profile_image_url_https":"https://pbs.twimg.com/profile_images/815279765259100161/RID37OuE_normal.jpg","geo_enabled":true,"time_zone":null,"is_translator":false,"statuses_count":64,"translator_type":"none","profile_background_tile":false,"derived":{"klout":{"score":17,"interest_topics":[{"name":"Twitter","score":0.86,"id":"10000000000000008253","url":"http://klout.com/topic/id/10000000000000008253"},{"name":"Media","score":0.71,"id":"7783102141237674703","url":"http://klout.com/topic/id/7783102141237674703"},{"name":"Emoji","score":0.5,"id":"10000000000000019376","url":"http://klout.com/topic/id/10000000000000019376"},{"name":"C++","score":0.44,"id":"9219221220892057324","url":"http://klout.com/topic/id/9219221220892057324"},{"name":"Boulder","score":0.43,"id":"7003086526134829815","url":"http://klout.com/topic/id/7003086526134829815"}],"user_id":"133700650730839424","influence_topic
s":[{"name":"Technology","score":0.53,"id":"10000000000000016635","url":"http://klout.com/topic/id/10000000000000016635"},{"name":"Latin","score":0.47,"id":"5227535270209280137","url":"http://klout.com/topic/id/5227535270209280137"},{"name":"Beer","score":0.47,"id":"7225108339966145103","url":"http://klout.com/topic/id/7225108339966145103"},{"name":"Business","score":0.44,"id":"10000000000000016634","url":"http://klout.com/topic/id/10000000000000016634"},{"name":"Food and Drink","score":0.43,"id":"10000000000000000001","url":"http://klout.com/topic/id/10000000000000000001"}],"profile_url":"http://klout.com/user/id/133700650730839424"}},"contributors_enabled":false,"profile_use_background_image":false,"utc_offset":null,"screen_name":"RobotPrincessFi","profile_background_color":"000000","followers_count":2,"profile_sidebar_border_color":"000000","default_profile":false,"profile_link_color":"F58EA8","following":null,"created_at":"Sat Dec 31 19:30:52 +0000 2016","profile_image_url":"http://pbs.twimg.com/profile_images/815279765259100161/RID37OuE_normal.jpg","description":"Niche millennial content aggregator | Just messing with Twitter bots | github: https://github.com/fionapigott/fiona-bot","favourites_count":0},"coordinates":null,"retweeted":false,"retweet_count":0,"contributors":null,"entities":{"urls":[{"indices":[117,140],"expanded_url":"https://twitter.com/i/web/status/867479301360205824","display_url":"twitter.com/i/web/status/8…","url":"https://t.co/QpqFmEDBLQ"}],"user_mentions":[],"hashtags":[],"symbols":[]},"created_at":"Wed May 24 20:35:57 +0000 2017","in_reply_to_screen_name":null,"quoted_status_id_str":"861652051016663040","favorited":false,"matching_rules":[{"tag":null}]} 2 | -------------------------------------------------------------------------------- /test/tweet_payload_examples/broken_and_unsupported_payloads/original_format_missing_user.json: -------------------------------------------------------------------------------- 1 | 
{"display_text_range":[0,48],"lang":"en","id":867833721579122688,"place":{"bounding_box":{"coordinates":[[[-70.608694,-33.486182],[-70.608694,-33.364619],[-70.429141,-33.364619],[-70.429141,-33.486182]]],"type":"Polygon"},"id":"00c4b64e7affea25","attributes":{},"full_name":"Las Condes, Chile","name":"Las Condes","place_type":"city","country":"Chile","url":"https:\/\/api.twitter.com\/1.1\/geo\/id\/00c4b64e7affea25.json","country_code":"CL"},"source":"Twitter Web Client<\/a>","geo":null,"possibly_sensitive":false,"truncated":false,"is_quote_status":false,"in_reply_to_status_id_str":null,"text":"J) I'm gonna include *two* photos in this Tweet! https:\/\/t.co\/iOGDJoWfME","favorite_count":0,"quote_count":0,"in_reply_to_user_id":null,"in_reply_to_status_id":null,"extended_entities":{"media":[{"indices":[49,72],"sizes":{"thumb":{"resize":"crop","h":150,"w":150},"medium":{"resize":"fit","h":236,"w":355},"small":{"resize":"fit","h":236,"w":355},"large":{"resize":"fit","h":236,"w":355}},"id":867833378313293826,"media_url":"http:\/\/pbs.twimg.com\/media\/DAsp3A6XUAITXbX.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/DAsp3A6XUAITXbX.jpg","url":"https:\/\/t.co\/iOGDJoWfME","type":"photo","display_url":"pic.twitter.com\/iOGDJoWfME","id_str":"867833378313293826","expanded_url":"https:\/\/twitter.com\/RobotPrincessFi\/status\/867833721579122688\/photo\/1"},{"indices":[49,72],"sizes":{"thumb":{"resize":"crop","h":150,"w":150},"medium":{"resize":"fit","h":423,"w":564},"small":{"resize":"fit","h":423,"w":564},"large":{"resize":"fit","h":423,"w":564}},"id":867833707989807104,"media_url":"http:\/\/pbs.twimg.com\/media\/DAsqKNDXsAAgcYI.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/DAsqKNDXsAAgcYI.jpg","url":"https:\/\/t.co\/iOGDJoWfME","type":"photo","display_url":"pic.twitter.com\/iOGDJoWfME","id_str":"867833707989807104","expanded_url":"https:\/\/twitter.com\/RobotPrincessFi\/status\/867833721579122688\/photo\/1"}]},"filter_level":"low","entities":{"urls":[],"user_m
entions":[],"media":[{"indices":[49,72],"sizes":{"thumb":{"resize":"crop","h":150,"w":150},"medium":{"resize":"fit","h":236,"w":355},"small":{"resize":"fit","h":236,"w":355},"large":{"resize":"fit","h":236,"w":355}},"id":867833378313293826,"media_url":"http:\/\/pbs.twimg.com\/media\/DAsp3A6XUAITXbX.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/DAsp3A6XUAITXbX.jpg","url":"https:\/\/t.co\/iOGDJoWfME","type":"photo","display_url":"pic.twitter.com\/iOGDJoWfME","id_str":"867833378313293826","expanded_url":"https:\/\/twitter.com\/RobotPrincessFi\/status\/867833721579122688\/photo\/1"}],"hashtags":[],"symbols":[]},"in_reply_to_user_id_str":null,"id_str":"867833721579122688","reply_count":0,"coordinates":null,"retweeted":false,"retweet_count":0,"contributors":null,"created_at":"Thu May 25 20:04:17 +0000 2017","in_reply_to_screen_name":null,"favorited":false,"matching_rules":[{"tag":null}]} 2 | -------------------------------------------------------------------------------- /tools/demo_notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Notebook to demonstrate the tweet_parser module\n", 8 | "#### Fiona Pigott" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": { 15 | "collapsed": true 16 | }, 17 | "outputs": [], 18 | "source": [ 19 | "# import the tweet_parser module\n", 20 | "from tweet_parser.tweet import Tweet\n", 21 | "import fileinput\n", 22 | "import json" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "# import the exact same set of Tweets in both original format and activity streams format\n", 32 | "activity_streams_tweets = []\n", 33 | "for line in fileinput.FileInput(\"../test/tweet_payload_examples/activity_streams_examples.json\"):\n", 34 | " 
activity_streams_tweets.append(Tweet(json.loads(line)))\n", 35 | "\n", 36 | "original_format_tweets = []\n", 37 | "for line in fileinput.FileInput(\"../test/tweet_payload_examples/original_format_examples.json\"):\n", 38 | " original_format_tweets.append(Tweet(json.loads(line)))" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "print(\"Available methods: \\n - {}\".format(\"\\n - \".join([x for x in activity_streams_tweets[0].__dir__() if x[0] != \"_\"])))" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "activity_streams_tweets[-4].tweet_links" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "for i,x in enumerate(activity_streams_tweets):\n", 66 | " print(x.id, x.tweet_type, x.tweet_links)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "for i,x in enumerate(original_format_tweets):\n", 76 | " print(i, \":\", x.all_text)\n", 77 | " print(\"##########\")" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": { 84 | "collapsed": true 85 | }, 86 | "outputs": [], 87 | "source": [ 88 | "quote_ception = original_format_tweets[2]" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "quote_ception.hashtags" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "for x in activity_streams_tweets:\n", 107 | " print(x.user_mentions_ids)\n", 108 | " print(\"##########\")" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": 
[], 116 | "source": [ 117 | "for x in activity_streams_tweets:\n", 118 | " print(x.all_text)\n", 119 | " print(\"##########\")" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "activity_streams_tweets[16].quoted_user" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "activity_streams_tweets[16].quote_tweet.user_mentions" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": { 144 | "scrolled": false 145 | }, 146 | "outputs": [], 147 | "source": [ 148 | "for i,x in enumerate(activity_streams_tweets):\n", 149 | " print(i,x.quoted_mentions)\n", 150 | " print(\"##########\")" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "for x in original_format_tweets:\n", 160 | " print(x.all_text_without_links)\n", 161 | " print(\"##########\")" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "for x in original_format_tweets:\n", 171 | " print(x.most_unrolled_urls)\n", 172 | " print(\"##########\")" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "for x in original_format_tweets:\n", 182 | " print(x.hashtags)\n", 183 | " print(\"##########\")" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "Tweet({\"thing\":\"th\"})" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "malformed_quotetweet = Tweet(json.load(\n", 202 | " 
open(\"tweet_payload_examples/broken_and_unsupported_payloads/original_format_missing_quotetweet_field.json\",\"r\")),\n", 203 | " do_format_checking = True)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "malformed_quotetweet.embedded_tweet" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": { 219 | "collapsed": true 220 | }, 221 | "outputs": [], 222 | "source": [] 223 | } 224 | ], 225 | "metadata": { 226 | "kernelspec": { 227 | "display_name": "Python 3", 228 | "language": "python", 229 | "name": "python3" 230 | }, 231 | "language_info": { 232 | "codemirror_mode": { 233 | "name": "ipython", 234 | "version": 3 235 | }, 236 | "file_extension": ".py", 237 | "mimetype": "text/x-python", 238 | "name": "python", 239 | "nbconvert_exporter": "python", 240 | "pygments_lexer": "ipython3", 241 | "version": "3.6.1" 242 | } 243 | }, 244 | "nbformat": 4, 245 | "nbformat_minor": 2 246 | } 247 | -------------------------------------------------------------------------------- /tools/parse_tweets.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2018 Twitter, Inc. 
3 | # Licensed under the MIT License 4 | # https://opensource.org/licenses/MIT 5 | 6 | #!/usr/bin/env python 7 | 8 | from tweet_parser.tweet import Tweet 9 | from tweet_parser.tweet_parser_errors import NotATweetError, NotAvailableError 10 | import argparse 11 | import fileinput 12 | import sys 13 | try: 14 | import ujson as json 15 | JSONDecodeError = ValueError 16 | except ImportError: 17 | import json 18 | if (sys.version_info[1] >= 5) and (sys.version_info[0] == 3): 19 | JSONDecodeError = json.JSONDecodeError 20 | else: 21 | JSONDecodeError = ValueError 22 | 23 | parser = argparse.ArgumentParser( 24 | description="Parse seqeunce of JSON formated activities.", formatter_class=argparse.RawTextHelpFormatter) 25 | parser.add_argument("-f", "--file", dest="data_files", 26 | default="-", 27 | help="Name of the file to read from, defaults to stdin") 28 | list_of_attrs = sorted([x for x in list(set(dir(Tweet)) - set(dir(dict))) if x[0] != "_"]) 29 | parser.add_argument("-c", "--csv", dest="func_list", 30 | default="id", 31 | help="comma separated list of attibutes to get \n possible functions include: \n -> {}".format(" \n -> ".join(list_of_attrs))) 32 | parser.add_argument("-d", "--delim", dest="delim", 33 | default="|", 34 | help="delimiter for the output csv, defaults to pipe") 35 | parser.add_argument("-z", "--compressed", action="store_true", dest="compressed", 36 | default=False, 37 | help="use this flag if data is compressed") 38 | parser.add_argument("-j", "--pass_bad_json", action="store_true", dest="pass_bad_json", 39 | default=False, 40 | help="use this flag to silently pass bad JSON payloads") 41 | parser.add_argument("-t", "--pass_non_tweet", action="store_true", dest="pass_non_tweet", 42 | default=False, 43 | help="use this flag to silently pass on non-tweet payloads") 44 | parser.add_argument("-a", "--pass_not_available", action="store_true", dest="pass_not_available", 45 | default=False, 46 | help="use this flag to silently pass on non-tweet payloads") 
47 | parser.add_argument("--do_format_validation", action="store_true", dest="do_format_validation", 48 | default=False, 49 | help="debug formatting") 50 | options = parser.parse_args() 51 | 52 | # get the functions that we need to use: 53 | functions = options.func_list.split(",") 54 | 55 | # get the compression 56 | if options.compressed: 57 | openhook = fileinput.hook_compressed 58 | else: 59 | openhook = None 60 | # parse some tweets 61 | for line in fileinput.FileInput(options.data_files, openhook=openhook): 62 | csv = [] 63 | # load the JSON 64 | try: 65 | tweet_dict = json.loads(line) 66 | except JSONDecodeError as json_error: 67 | if not options.pass_bad_json: 68 | sys.stderr.write("{}. Use the flag '-j' to pass silently next time.\nBad JSON payload: {}".format(json_error, line)) 69 | continue 70 | # load a Tweet 71 | try: 72 | tweet_obj = Tweet(tweet_dict, do_format_validation=options.do_format_validation) 73 | except NotATweetError as nate: 74 | if not options.pass_non_tweet: 75 | sys.stderr.write("{}. Use the flag '-t' to pass silently next time.\nNon Tweet payload: {}".format(nate, line)) 76 | continue 77 | # get the relevant fields 78 | for func in functions: 79 | try: 80 | attribute = getattr(tweet_obj, func) 81 | if sys.version_info[0] == 3: 82 | csv.append(str(attribute)) 83 | else: 84 | if isinstance(attribute, str) or isinstance(attribute, unicode): 85 | csv.append(attribute.encode("utf-8")) 86 | else: 87 | csv.append(str(attribute)) 88 | except NotAvailableError as nae: 89 | if not options.pass_not_available: 90 | sys.stderr.write("{}. 
Use the flag -a to pass silently next time.\nAttribute Unavailable: {}".format(nae, line)) 91 | csv.append("NOT_AVAILABLE") 92 | sys.stdout.write(options.delim.join(csv) + "\n") 93 | -------------------------------------------------------------------------------- /tweet_parser/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xdevplatform/tweet_parser/3435de8367d36b483a6cfd8d46cc28694ee8a42e/tweet_parser/__init__.py -------------------------------------------------------------------------------- /tweet_parser/deprecator.py: -------------------------------------------------------------------------------- 1 | # https://stackoverflow.com/questions/2536307/decorators-in-the-python-standard-lib-deprecated-specifically 2 | 3 | import functools 4 | import inspect 5 | import warnings 6 | 7 | string_types = (type(b''), type(u'')) 8 | 9 | class FieldDeprecationWarning(Warning): 10 | pass 11 | 12 | def deprecated(reason): 13 | """ 14 | This is a decorator which can be used to mark functions 15 | as deprecated. It will result in a warning being emitted 16 | when the function is used. 17 | """ 18 | 19 | if isinstance(reason, string_types): 20 | 21 | # The @deprecated is used with a 'reason'. 22 | # 23 | # .. code-block:: python 24 | # 25 | # @deprecated("please, use another function") 26 | # def old_function(x, y): 27 | # pass 28 | 29 | def decorator(func1): 30 | 31 | if inspect.isclass(func1): 32 | fmt1 = "Call to deprecated class {name} ({reason})." 33 | else: 34 | fmt1 = "Call to deprecated function {name} ({reason})." 
35 | 36 | @functools.wraps(func1) 37 | def new_func1(*args, **kwargs): 38 | #warnings.simplefilter('default', DeprecationWarning) 39 | warnings.warn( 40 | fmt1.format(name=func1.__name__, reason=reason), 41 | category=FieldDeprecationWarning, 42 | stacklevel=2 43 | ) 44 | #warnings.simplefilter('default', DeprecationWarning) 45 | return func1(*args, **kwargs) 46 | 47 | return new_func1 48 | 49 | return decorator 50 | 51 | elif inspect.isclass(reason) or inspect.isfunction(reason): 52 | 53 | # The @deprecated is used without any 'reason'. 54 | # 55 | # .. code-block:: python 56 | # 57 | # @deprecated 58 | # def old_function(x, y): 59 | # pass 60 | 61 | func2 = reason 62 | 63 | if inspect.isclass(func2): 64 | fmt2 = "Call to deprecated class {name}." 65 | else: 66 | fmt2 = "Call to deprecated function {name}." 67 | 68 | @functools.wraps(func2) 69 | def new_func2(*args, **kwargs): 70 | #warnings.simplefilter('default', DeprecationWarning) 71 | warnings.warn( 72 | fmt2.format(name=func2.__name__), 73 | category=FieldDeprecationWarning, 74 | stacklevel=2 75 | ) 76 | #warnings.simplefilter('default', DeprecationWarning) 77 | return func2(*args, **kwargs) 78 | 79 | return new_func2 80 | 81 | else: 82 | raise TypeError(repr(type(reason))) 83 | -------------------------------------------------------------------------------- /tweet_parser/getter_methods/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xdevplatform/tweet_parser/3435de8367d36b483a6cfd8d46cc28694ee8a42e/tweet_parser/getter_methods/__init__.py -------------------------------------------------------------------------------- /tweet_parser/getter_methods/gnip_fields.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Twitter, Inc. 
2 | # Licensed under the MIT License 3 | # https://opensource.org/licenses/MIT 4 | from tweet_parser.tweet_checking import is_original_format 5 | 6 | 7 | def get_matching_rules(tweet): 8 | """ 9 | Retrieves the matching rules for a tweet with a gnip field enrichment. 10 | 11 | Args: 12 | tweet (Tweet): the tweet 13 | 14 | Returns: 15 | list: potential ``[{"tag": "user_tag", "value": "rule_value"}]`` 16 | pairs from standard rulesets or None if no rules or no 17 | matching_rules field is found. \n 18 | More information on this value at: 19 | http://support.gnip.com/enrichments/matching_rules.html 20 | 21 | """ 22 | if is_original_format(tweet): 23 | rules = tweet.get("matching_rules") 24 | else: 25 | gnip = tweet.get("gnip") 26 | rules = gnip.get("matching_rules") if gnip else None 27 | return rules 28 | -------------------------------------------------------------------------------- /tweet_parser/getter_methods/tweet_counts.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright 2018 Twitter, Inc. 3 | # Licensed under the MIT License 4 | # https://opensource.org/licenses/MIT 5 | """Tweet counts and related attributes 6 | 7 | This module holds attributes related to basic counts on tweets, such as 8 | retweets, favs, and quotes. It is unlikely to be extended. 9 | """ 10 | 11 | from tweet_parser.tweet_checking import is_original_format 12 | from tweet_parser.tweet_parser_errors import NotAvailableError 13 | 14 | def get_retweet_count(tweet): 15 | """ 16 | Gets the retweet count for this tweet. 17 | 18 | Args: 19 | tweet (Tweet): A Tweet object (or a dictionary) 20 | 21 | Returns: 22 | int: The number of times the Tweet has been retweeted 23 | 24 | Example: 25 | >>> from tweet_parser.getter_methods.tweet_counts import get_retweet_count 26 | >>> tweet = {'created_at': '2017-21-23T15:21:21.000Z', 27 | ... 'id_str': '2382763597', 28 | ... 
'retweet_count': 2} 29 | >>> get_retweet_count(tweet) 30 | 2 31 | 32 | >>> activity_streams_tweet = {'postedTime': '2017-05-24T20:17:19.000Z', 33 | ... 'retweetCount': 3} 34 | >>> get_retweet_count(activity_streams_tweet) 35 | 3 36 | """ 37 | if is_original_format(tweet): 38 | return tweet.get("retweet_count", 0) 39 | else: 40 | return tweet.get("retweetCount", 0) 41 | 42 | 43 | def get_favorite_count(tweet): 44 | """ 45 | Gets the favorite count for this tweet. 46 | 47 | Args: 48 | tweet (Tweet): A Tweet object (or a dictionary) 49 | 50 | Returns: 51 | int: The number of times the Tweet has been favorited 52 | 53 | Example: 54 | >>> from tweet_parser.getter_methods.tweet_counts import get_favorite_count 55 | >>> tweet = {'created_at': '2017-21-23T15:21:21.000Z', 56 | ... 'id_str': '2382763597', 57 | ... 'favorite_count': 2} 58 | >>> get_favorite_count(tweet) 59 | 2 60 | 61 | >>> activity_streams_tweet = {'postedTime': '2017-05-24T20:17:19.000Z', 62 | ... 'favoritesCount': 3} 63 | >>> get_favorite_count(activity_streams_tweet) 64 | 3 65 | """ 66 | if is_original_format(tweet): 67 | return tweet.get("favorite_count", 0) 68 | else: 69 | return tweet.get("favoritesCount", 0) 70 | 71 | 72 | def get_quote_count(tweet): 73 | """ 74 | Gets the quote count for this tweet. \n 75 | Note that this is unavailable in activity-streams format 76 | 77 | Args: 78 | tweet (Tweet): A Tweet object (or a dictionary) 79 | 80 | Returns: 81 | int: The number of times the Tweet has been quoted 82 | or for activity-streams raise a NotAvailableError 83 | 84 | Example: 85 | >>> from tweet_parser.getter_methods.tweet_counts import get_quote_count 86 | >>> tweet = {'created_at': '2017-21-23T15:21:21.000Z', 87 | ... 'id_str': '2382763597', 88 | ... 
'quote_count': 2} 89 | >>> get_quote_count(tweet) 90 | 2 91 | """ 92 | if is_original_format(tweet): 93 | return tweet.get("quote_count", 0) 94 | else: 95 | raise NotAvailableError("Quote counts are only available in original format") 96 | -------------------------------------------------------------------------------- /tweet_parser/getter_methods/tweet_date.py: -------------------------------------------------------------------------------- 1 | # Twitter Snowflake ID to timestamp (and back) 2 | # https://github.com/client9/snowflake2time/ 3 | # Nick Galbreath @ngalbreath nickg@client9.com 4 | # Public Domain -- No Copyright -- Cut-n-Paste 5 | 6 | 7 | def snowflake2utc(sf): 8 | """ 9 | Convert a Twitter snowflake ID to a Unix timestamp 10 | (seconds since Jan 1 1970 00:00:00) 11 | 12 | Args: 13 | sf (str): Twitter snowflake ID as a string 14 | 15 | Returns: 16 | int: seconds since Jan 1 1970 00:00:00 17 | """ 18 | sf_int = int(sf) 19 | return int(((sf_int >> 22) + 1288834974657) / 1000.0) 20 | -------------------------------------------------------------------------------- /tweet_parser/getter_methods/tweet_embeds.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Twitter, Inc. 2 | # Licensed under the MIT License 3 | # https://opensource.org/licenses/MIT 4 | from tweet_parser.tweet_checking import is_original_format 5 | from tweet_parser.getter_methods.tweet_text import get_tweet_type 6 | 7 | 8 | def get_quoted_tweet(tweet): 9 | """ 10 | Get the quoted Tweet and return it as a dictionary 11 | If the Tweet is not a quote Tweet, return None 12 | 13 | Args: 14 | tweet (Tweet or dict): A Tweet object or a dictionary 15 | 16 | Returns: 17 | dict: A dictionary representing the quoted status 18 | or None if there is no quoted status. 
\n 19 | - For original format, this is the value of "quoted_status" \n 20 | - For activity streams, this is the value of "twitter_quoted_status" 21 | """ 22 | if get_tweet_type(tweet) == "quote": 23 | if is_original_format(tweet): 24 | return tweet["quoted_status"] 25 | else: 26 | return tweet["twitter_quoted_status"] 27 | 28 | else: 29 | return None 30 | 31 | 32 | def get_retweeted_tweet(tweet): 33 | """ 34 | Get the retweeted Tweet and return it as a dictionary 35 | If the Tweet is not a Retweet, return None 36 | 37 | Args: 38 | tweet (Tweet or dict): A Tweet object or a dictionary 39 | 40 | Returns: 41 | dict: A dictionary representing the retweeted status 42 | or None if there is no retweeted status. \n 43 | - For original format, this is the value of "retweeted_status" \n 44 | - For activity streams, if the Tweet is a Retweet this is the value of the key "object" 45 | """ 46 | if get_tweet_type(tweet) == "retweet": 47 | if is_original_format(tweet): 48 | return tweet["retweeted_status"] 49 | else: 50 | return tweet["object"] 51 | else: 52 | return None 53 | 54 | 55 | def get_embedded_tweet(tweet): 56 | """ 57 | Get the retweeted Tweet OR the quoted Tweet and return it as a dictionary 58 | 59 | Args: 60 | tweet (Tweet): A Tweet object (not simply a dict) 61 | 62 | Returns: 63 | dict (or None, if the Tweet is neither a quote tweet nor a Retweet): 64 | a dictionary representing the quote Tweet or the Retweet 65 | """ 66 | if tweet.retweeted_tweet is not None: 67 | return tweet.retweeted_tweet 68 | elif tweet.quoted_tweet is not None: 69 | return tweet.quoted_tweet 70 | else: 71 | return None 72 | -------------------------------------------------------------------------------- /tweet_parser/getter_methods/tweet_entities.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Twitter, Inc. 
2 | # Licensed under the MIT License 3 | # https://opensource.org/licenses/MIT 4 | from tweet_parser.tweet_checking import is_original_format 5 | from tweet_parser.getter_methods.tweet_embeds import get_retweeted_tweet 6 | from tweet_parser.getter_methods.tweet_text import get_tweet_type 7 | 8 | def get_entities(tweet): 9 | """ 10 | Helper function to simplify grabbing the entities. \n 11 | Caveat: In the case of Retweets, a Retweet is stored as 12 | "RT @someone: Some awesome status". In the case where prepending 13 | the string "RT @someone:" causes the Tweet to exceed 140 characters, 14 | entities (hashtags, mentions, urls) beyond the 140 character mark are 15 | excluded from the Retweet's entities. This seems like counterintuitive 16 | behavior, so we ensure here that the entities of a Retweet are a 17 | superset of the entities of the Retweeted status. 18 | 19 | Args: 20 | tweet (Tweet or dict): Tweet in question 21 | 22 | Returns: 23 | dict: dictionary of potential entities. 24 | 25 | Example: 26 | >>> from tweet_parser.getter_methods.tweet_entities import get_entities 27 | >>> original = {"created_at": "Wed May 24 20:17:19 +0000 2017", 28 | ... "entities": {"user_mentions": [{ 29 | ... "indices": [14,26], #characters where the @ mention appears 30 | ... "id_str": "2382763597", #id of @ mentioned user as a string 31 | ... "screen_name": "notFromShrek", #screen_name of @ mentioned user 32 | ... "name": "Fiona", #display name of @ mentioned user 33 | ... "id": 2382763597 #id of @ mentioned user as an int 34 | ... }] 35 | ... } 36 | ... 
} 37 | >>> get_entities(original) 38 | {'user_mentions': [{'indices': [14, 26], 'id_str': '2382763597', 'screen_name': 'notFromShrek', 'name': 'Fiona', 'id': 2382763597}]} 39 | """ 40 | 41 | entity_key = "entities" if is_original_format(tweet) else "twitter_entities" 42 | if get_tweet_type(tweet) == "retweet": 43 | retweet_entities = tweet.get(entity_key, []) 44 | all_entities = get_retweeted_tweet(tweet).get(entity_key,[]).copy() 45 | # the only thing that the Retweet will have that the Retweeted Tweet 46 | # won't have is the @-mention of the RTd user at the front ("RT @someone:") 47 | # I'm going to add that in, so that the Retweet's entities are a superset 48 | # of the RTd Tweet's entities 49 | all_entities["user_mentions"] = ([retweet_entities["user_mentions"][0]] + 50 | all_entities["user_mentions"]) 51 | return all_entities 52 | else: 53 | return tweet.get(entity_key, []) 54 | 55 | 56 | def get_media_entities(tweet): 57 | """ 58 | Grabs all the media entities from a tweet, which are contained in the 59 | "extended_entities" or "twitter_extended_entities" field depending on the 60 | tweet format. Note that this is not the same as the first media entity from 61 | the basic `entities` key; this is required to get *all* of the potential 62 | media contained within a tweet. This is useful as an entry point for other 63 | functions or for any custom parsing that needs to be done. 64 | 65 | Args: 66 | tweet (Tweet or dict): the tweet in question 67 | 68 | Returns: 69 | list: the list of dicts containing each media's metadata in the 70 | tweet. 71 | 72 | Example: 73 | >>> from tweet_parser.getter_methods.tweet_entities import get_media_entities 74 | >>> tweet = {'created_at': '2017-21-23T15:21:21.000Z', 75 | ... 'entities': {'user_mentions': [{'id': 2382763597, 76 | ... 'id_str': '2382763597', 77 | ... 'indices': [14, 26], 78 | ... 'name': 'Fiona', 79 | ... 'screen_name': 'notFromShrek'}]}, 80 | ... 
'extended_entities': {'media': [{'display_url': 'pic.twitter.com/something', 81 | ... 'expanded_url': 'https://twitter.com/something', 82 | ... 'id': 4242, 83 | ... 'id_str': '4242', 84 | ... 'indices': [88, 111], 85 | ... 'media_url': 'http://pbs.twimg.com/media/something.jpg', 86 | ... 'media_url_https': 'https://pbs.twimg.com/media/something.jpg', 87 | ... 'sizes': {'large': {'h': 1065, 'resize': 'fit', 'w': 1600}, 88 | ... 'medium': {'h': 799, 'resize': 'fit', 'w': 1200}, 89 | ... 'small': {'h': 453, 'resize': 'fit', 'w': 680}, 90 | ... 'thumb': {'h': 150, 'resize': 'crop', 'w': 150}}, 91 | ... 'type': 'photo', 92 | ... 'url': 'https://t.co/something'}, 93 | ... {'display_url': 'pic.twitter.com/something_else', 94 | ... 'expanded_url': 'https://twitter.com/user/status/something/photo/1', 95 | ... 'id': 4243, 96 | ... 'id_str': '4243', 97 | ... 'indices': [88, 111], 98 | ... 'media_url': 'http://pbs.twimg.com/media/something_else.jpg', 99 | ... 'media_url_https': 'https://pbs.twimg.com/media/something_else.jpg', 100 | ... 'sizes': {'large': {'h': 1065, 'resize': 'fit', 'w': 1600}, 101 | ... 'medium': {'h': 799, 'resize': 'fit', 'w': 1200}, 102 | ... 'small': {'h': 453, 'resize': 'fit', 'w': 680}, 103 | ... 'thumb': {'h': 150, 'resize': 'crop', 'w': 150}}, 104 | ... 'type': 'photo', 105 | ... 'url': 'https://t.co/something_else'}]} 106 | ... 
} 107 | >>> get_media_entities(tweet) 108 | [{'display_url': 'pic.twitter.com/something', 'expanded_url': 'https://twitter.com/something', 'id': 4242, 'id_str': '4242', 'indices': [88, 111], 'media_url': 'http://pbs.twimg.com/media/something.jpg', 'media_url_https': 'https://pbs.twimg.com/media/something.jpg', 'sizes': {'large': {'h': 1065, 'resize': 'fit', 'w': 1600}, 'medium': {'h': 799, 'resize': 'fit', 'w': 1200}, 'small': {'h': 453, 'resize': 'fit', 'w': 680}, 'thumb': {'h': 150, 'resize': 'crop', 'w': 150}}, 'type': 'photo', 'url': 'https://t.co/something'}, {'display_url': 'pic.twitter.com/something_else', 'expanded_url': 'https://twitter.com/user/status/something/photo/1', 'id': 4243, 'id_str': '4243', 'indices': [88, 111], 'media_url': 'http://pbs.twimg.com/media/something_else.jpg', 'media_url_https': 'https://pbs.twimg.com/media/something_else.jpg', 'sizes': {'large': {'h': 1065, 'resize': 'fit', 'w': 1600}, 'medium': {'h': 799, 'resize': 'fit', 'w': 1200}, 'small': {'h': 453, 'resize': 'fit', 'w': 680}, 'thumb': {'h': 150, 'resize': 'crop', 'w': 150}}, 'type': 'photo', 'url': 'https://t.co/something_else'}] 109 | """ 110 | 111 | ext_ents_key = "extended_entities" if is_original_format(tweet) else "twitter_extended_entities" 112 | ext_ents = tweet.get(ext_ents_key) 113 | media = ext_ents.get("media", []) if ext_ents else [] 114 | return media 115 | 116 | 117 | def get_media_urls(tweet): 118 | """ 119 | Gets the https links to each media entity in the tweet. 120 | 121 | Args: 122 | tweet (Tweet or dict): tweet 123 | 124 | Returns: 125 | list: list of urls. Will be an empty list if there are no urls present. 126 | 127 | Example: 128 | >>> from tweet_parser.getter_methods.tweet_entities import get_media_urls 129 | >>> tweet = {'created_at': '2017-21-23T15:21:21.000Z', 130 | ... 'entities': {'user_mentions': [{'id': 2382763597, 131 | ... 'id_str': '2382763597', 132 | ... 'indices': [14, 26], 133 | ... 'name': 'Fiona', 134 | ... 
'screen_name': 'notFromShrek'}]}, 135 | ... 'extended_entities': {'media': [{'display_url': 'pic.twitter.com/something', 136 | ... 'expanded_url': 'https://twitter.com/something', 137 | ... 'id': 4242, 138 | ... 'id_str': '4242', 139 | ... 'indices': [88, 111], 140 | ... 'media_url': 'http://pbs.twimg.com/media/something.jpg', 141 | ... 'media_url_https': 'https://pbs.twimg.com/media/something.jpg', 142 | ... 'sizes': {'large': {'h': 1065, 'resize': 'fit', 'w': 1600}, 143 | ... 'medium': {'h': 799, 'resize': 'fit', 'w': 1200}, 144 | ... 'small': {'h': 453, 'resize': 'fit', 'w': 680}, 145 | ... 'thumb': {'h': 150, 'resize': 'crop', 'w': 150}}, 146 | ... 'type': 'photo', 147 | ... 'url': 'https://t.co/something'}, 148 | ... {'display_url': 'pic.twitter.com/something_else', 149 | ... 'expanded_url': 'https://twitter.com/user/status/something/photo/1', 150 | ... 'id': 4243, 151 | ... 'id_str': '4243', 152 | ... 'indices': [88, 111], 153 | ... 'media_url': 'http://pbs.twimg.com/media/something_else.jpg', 154 | ... 'media_url_https': 'https://pbs.twimg.com/media/something_else.jpg', 155 | ... 'sizes': {'large': {'h': 1065, 'resize': 'fit', 'w': 1600}, 156 | ... 'medium': {'h': 799, 'resize': 'fit', 'w': 1200}, 157 | ... 'small': {'h': 453, 'resize': 'fit', 'w': 680}, 158 | ... 'thumb': {'h': 150, 'resize': 'crop', 'w': 150}}, 159 | ... 'type': 'photo', 160 | ... 'url': 'https://t.co/something_else'}]} 161 | ... } 162 | >>> get_media_urls(tweet) 163 | ['https://pbs.twimg.com/media/something.jpg', 'https://pbs.twimg.com/media/something_else.jpg'] 164 | """ 165 | 166 | media = get_media_entities(tweet) 167 | urls = [m.get("media_url_https") for m in media] if media else [] 168 | return urls 169 | 170 | 171 | 172 | def get_user_mentions(tweet): 173 | """ 174 | Get the @-mentions in the Tweet as dictionaries. 175 | Note that in the case of a quote-tweet, this does not return the users 176 | mentioned in the quoted status. 
The recommended way to get that list would 177 | be to use get_user_mentions on the quoted status. 178 | Also note that in the case of a quote-tweet, the list of @-mentioned users 179 | does not include the user who authored the original (quoted) Tweet. 180 | 181 | Args: 182 | tweet (Tweet or dict): A Tweet object or dictionary 183 | 184 | Returns: 185 | list (list of dicts): 1 item per @ mention. Note that the fields here 186 | aren't enforced by the parser, they are simply the fields as they 187 | appear in a Tweet data payload. 188 | 189 | Example: 190 | >>> from tweet_parser.getter_methods.tweet_entities import get_user_mentions 191 | >>> original = {"created_at": "Wed May 24 20:17:19 +0000 2017", 192 | ... "text": "RT @notFromShrek: Stuff! Words! ...", 193 | ... "entities": {"user_mentions": [{ 194 | ... "indices": [2,12], #characters where the @ mention appears 195 | ... "id_str": "2382763597", #id of @ mentioned user as a string 196 | ... "screen_name": "notFromShrek", #screen_name of @d user 197 | ... "name": "Fiona", #display name of @ mentioned user 198 | ... "id": 2382763597 #id of @ mentioned user as an int 199 | ... }] 200 | ... }, 201 | ... "retweeted_status": { 202 | ... "created_at": "Wed May 24 20:01:19 +0000 2017", 203 | ... "text": "Stuff! Words! #Tweeting!", 204 | ... "entities": {"user_mentions": []} 205 | ... } 206 | ... } 207 | >>> get_user_mentions(original) 208 | [{'indices': [2, 12], 'id_str': '2382763597', 'screen_name': 'notFromShrek', 'name': 'Fiona', 'id': 2382763597}] 209 | """ 210 | entities = get_entities(tweet) 211 | user_mentions = entities.get("user_mentions") if entities else None 212 | return user_mentions if user_mentions else [] 213 | 214 | 215 | def get_hashtags(tweet): 216 | """ 217 | Get a list of hashtags in the Tweet 218 | Note that in the case of a quote-tweet, this does not return the 219 | hashtags in the quoted status. 
220 | 221 | Args: 222 | tweet (Tweet or dict): A Tweet object or dictionary 223 | 224 | Returns: 225 | list (a list of strings): list of all of the hashtags in the Tweet 226 | 227 | Example: 228 | >>> from tweet_parser.getter_methods.tweet_entities import get_hashtags 229 | >>> original = {"created_at": "Wed May 24 20:17:19 +0000 2017", 230 | ... "entities": {"hashtags": [{"text":"1hashtag"}]}} 231 | >>> get_hashtags(original) 232 | ['1hashtag'] 233 | 234 | >>> activity = {"postedTime": "2017-05-24T20:17:19.000Z", 235 | ... "verb": "post", 236 | ... "twitter_entities": {"hashtags": [ 237 | ... {"text":"1hashtag"}, 238 | ... {"text": "moreHashtags"}]}} 239 | >>> get_hashtags(activity) 240 | ['1hashtag', 'moreHashtags'] 241 | """ 242 | entities = get_entities(tweet) 243 | hashtags = entities.get("hashtags") 244 | hashtags = [tag["text"] for tag in hashtags] if hashtags else [] 245 | return hashtags 246 | -------------------------------------------------------------------------------- /tweet_parser/getter_methods/tweet_generator.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Twitter, Inc. 
2 | # Licensed under the MIT License 3 | # https://opensource.org/licenses/MIT 4 | from tweet_parser.tweet_checking import is_original_format 5 | import sys 6 | if sys.version_info[0] == 3: 7 | from html.parser import HTMLParser 8 | elif sys.version_info[0] == 2: 9 | from HTMLParser import HTMLParser 10 | 11 | class GeneratorHTMLParser(HTMLParser): 12 | """ 13 | HTML parser class to handle HTML tags in the original format source field 14 | """ 15 | def handle_starttag(self, tag, attrs): 16 | for attr in attrs: 17 | if attr[0] == "href": 18 | self.generator_link = attr[1] 19 | 20 | def handle_data(self, data): 21 | self.generator_name = data 22 | 23 | 24 | def get_generator(tweet): 25 | """ 26 | Get information about the application that generated the Tweet 27 | 28 | Args: 29 | tweet (Tweet): A Tweet object (or a dictionary) 30 | 31 | Returns: 32 | dict: keys are 'link' and 'name', the web link and the name 33 | of the application 34 | 35 | Example: 36 | >>> from tweet_parser.getter_methods.tweet_generator import get_generator 37 | >>> original_format_dict = { 38 | ... "created_at": "Wed May 24 20:17:19 +0000 2017", 39 | ... "source": 'Twitter Web Client' 40 | ... } 41 | >>> get_generator(original_format_dict) 42 | {'link': 'http://twitter.com', 'name': 'Twitter Web Client'} 43 | 44 | >>> activity_streams_format_dict = { 45 | ... "postedTime": "2017-05-24T20:17:19.000Z", 46 | ... "generator": 47 | ... {"link": "http://twitter.com", 48 | ... "displayName": "Twitter Web Client"} 49 | ... 
} 50 | >>> get_generator(activity_streams_format_dict) 51 | {'link': 'http://twitter.com', 'name': 'Twitter Web Client'} 52 | """ 53 | if is_original_format(tweet): 54 | if sys.version_info[0] == 3 and sys.version_info[1] >= 4: 55 | parser = GeneratorHTMLParser(convert_charrefs=True) 56 | else: 57 | parser = GeneratorHTMLParser() 58 | parser.feed(tweet["source"]) 59 | return {"link": parser.generator_link, 60 | "name": parser.generator_name} 61 | else: 62 | return {"link": tweet["generator"]["link"], 63 | "name": tweet["generator"]["displayName"]} 64 | -------------------------------------------------------------------------------- /tweet_parser/getter_methods/tweet_geo.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Twitter, Inc. 2 | # Licensed under the MIT License 3 | # https://opensource.org/licenses/MIT 4 | from tweet_parser.tweet_checking import is_original_format 5 | 6 | 7 | def get_geo_coordinates(tweet): 8 | """ 9 | Get the user's geo coordinates, if they are included in the payload 10 | (otherwise return None) 11 | 12 | Args: 13 | tweet (Tweet or dict): A Tweet object or dictionary 14 | 15 | Returns: 16 | dict: dictionary with the keys "latitude" and "longitude" 17 | or, if unavailable, None 18 | 19 | Example: 20 | >>> from tweet_parser.getter_methods.tweet_geo import get_geo_coordinates 21 | >>> tweet_geo = {"geo": {"coordinates": [1,-1]}} 22 | >>> get_geo_coordinates(tweet_geo) 23 | {'latitude': 1, 'longitude': -1} 24 | 25 | >>> tweet_no_geo = {"geo": {}} 26 | >>> get_geo_coordinates(tweet_no_geo) #returns None 27 | """ 28 | if "geo" in tweet: 29 | if tweet["geo"] is not None: 30 | if "coordinates" in tweet["geo"]: 31 | [lat, lon] = tweet["geo"]["coordinates"] 32 | return {"latitude": lat, "longitude": lon} 33 | return None 34 | 35 | 36 | def get_profile_location(tweet): 37 | """ 38 | Get user's derived location data from the profile location enrichment 39 | If unavailable, returns None. 
40 | 41 | Args: 42 | tweet (Tweet or dict): Tweet object or dictionary 43 | 44 | Returns: 45 | dict: more information on the profile locations enrichment here: 46 | http://support.gnip.com/enrichments/profile_geo.html 47 | 48 | Example: 49 | >>> result = {"country": "US", # Two letter ISO-3166 country code 50 | ... "locality": "Boulder", # The locality location (~ city) 51 | ... "region": "Colorado", # The region location (~ state/province) 52 | ... "sub_region": "Boulder", # The sub-region location (~ county) 53 | ... "full_name": "Boulder, Colorado, US", # The full name (excluding sub-region) 54 | ... "geo": [40,-105] # lat/long value that coordinate that corresponds to 55 | ... # the lowest granularity location for where the user 56 | ... # who created the Tweet is from 57 | ... } 58 | 59 | Caveats: 60 | This only returns the first element of the 'locations' list. 61 | I'm honestly not sure what circumstances would result in a list that 62 | is more than one element long. 63 | """ 64 | if is_original_format(tweet): 65 | try: 66 | return tweet["user"]["derived"]["locations"][0] 67 | except KeyError: 68 | return None 69 | else: 70 | try: 71 | location = tweet["gnip"]["profileLocations"][0] 72 | reconstructed_original_format = {} 73 | if location["address"].get("country", None) is not None: 74 | reconstructed_original_format["country"] = location["address"]["country"] 75 | if location["address"].get("countryCode", None) is not None: 76 | reconstructed_original_format["country_code"] = location["address"]["countryCode"] 77 | if location["address"].get("locality", None) is not None: 78 | reconstructed_original_format["locality"] = location["address"]["locality"] 79 | if location["address"].get("region", None) is not None: 80 | reconstructed_original_format["region"] = location["address"]["region"] 81 | if location["address"].get("subRegion", None) is not None: 82 | reconstructed_original_format["sub_region"] = location["address"]["subRegion"] 83 | if 
def get_tweet_links(tweet):
    """
    Get the links that are included in the Tweet as "urls"
    (if there are no links in the Tweet, this returns an empty list)
    This includes links that are included in quoted or retweeted Tweets
    Returns unrolled or expanded_url information if it is available

    The payload itself is not modified: the "urls" list is copied before the
    quoted/retweeted links are appended, and (activity-streams format) each
    url dict is copied before an "unwound" entry is attached to it.

    Args:
        tweet (Tweet): A Tweet object (must be a Tweet obj, not a dict)

    Returns:
        list (list of dicts): A list of dictionaries containing information
        about urls. Each dictionary entity can have these keys; without
        unwound url or expanded url Twitter data enrichments many of these
        fields will be missing. \n
        More information about the Twitter url enrichments at:
        http://support.gnip.com/enrichments/expanded_urls.html and
        http://support.gnip.com/enrichments/enhanced_urls.html

    Example:
        >>> result = [
        ...   {
        ...   # url that shows up in the tweet text
        ...   'display_url': "https://twitter.com/RobotPrinc...",
        ...   # long (expanded) url
        ...   'expanded_url': "https://twitter.com/RobotPrincessFi",
        ...   # characters where the display link is
        ...   'indices': [55, 88],
        ...   'unwound': {
        ...      # description from the linked webpage
        ...      'description': "the Twitter profile of RobotPrincessFi",
        ...      'status': 200,
        ...      # title of the webpage
        ...      'title': "the Twitter profile of RobotPrincessFi",
        ...      # long (expanded) url}
        ...      'url': "https://twitter.com/RobotPrincessFi"},
        ...   # the url that tweet directs to, often t.co
        ...   'url': "t.co/1234"}]
    """
    original_format = is_original_format(tweet)
    entities_key = "entities" if original_format else "twitter_entities"
    # get the urls from the Tweet
    try:
        # Copy the list: `+=` on the payload's own list would extend
        # tweet[...]["urls"] in place, so every further call would return
        # (and store) the quoted/retweeted links one more time.
        tweet_urls = list(tweet[entities_key]["urls"])
    except KeyError:
        tweet_urls = []
    # get the urls from the quote-tweet
    if tweet.quoted_tweet is not None:
        tweet_urls += tweet.quoted_tweet.tweet_links
    # get the urls from the retweet
    if tweet.retweeted_tweet is not None:
        tweet_urls += tweet.retweeted_tweet.tweet_links
    if original_format:
        return tweet_urls

    # activity-streams only: combine the urls with the gnip url enrichment
    # to get the same shape as the original format urls
    try:
        gnip_urls = tweet["gnip"]["urls"]
        gnip_by_url = {x["url"]: x for x in gnip_urls}
        gnip_by_expanded_url = {x["expanded_url"]: x for x in gnip_urls}
    except KeyError:
        # no (usable) enrichment in the payload
        return tweet_urls
    key_mappings = {"expanded_url": "url",
                    "expanded_status": "status",
                    "expanded_url_title": "title",
                    "expanded_url_description": "description"}

    def _unwound(gnip_entry):
        # Keep only the enrichment keys we know how to rename; this drops
        # "url" and tolerates any additional keys in the gnip entry.
        return {key_mappings[key]: value
                for key, value in gnip_entry.items()
                if key in key_mappings}

    tweet_urls_expanded = []
    for url in tweet_urls:
        # copy so the "unwound" key is not written into the payload's dict
        expanded_url = dict(url)
        if url["url"] in gnip_by_url:
            expanded_url["unwound"] = _unwound(gnip_by_url[url["url"]])
        elif url.get("expanded_url", "UNAVAILABLE") in gnip_by_expanded_url:
            # look the entry up in the mapping that was just tested
            # (keyed by expanded url, not by short url)
            expanded_url["unwound"] = _unwound(
                gnip_by_expanded_url[url["expanded_url"]])
        tweet_urls_expanded.append(expanded_url)
    return tweet_urls_expanded
def get_in_reply_to_screen_name(tweet):
    """
    Get the screen name of the user whose Tweet is being replied to, None
    if this Tweet is not a reply

    Args:
        tweet (Tweet): A Tweet object (or a dictionary)

    Returns:
        str: the screen name of the user whose Tweet is being replied to
        (None if not a reply)

    Example:
        >>> from tweet_parser.getter_methods.tweet_reply import *
        >>> original_format_dict = {
        ...             "created_at": "Wed May 24 20:17:19 +0000 2017",
        ...             "in_reply_to_screen_name": "notFromShrek"
        ...            }
        >>> get_in_reply_to_screen_name(original_format_dict)
        'notFromShrek'

        >>> activity_streams_format_dict = {
        ...         "postedTime": "2017-05-24T20:17:19.000Z",
        ...         "inReplyTo":
        ...            {"link": "http://twitter.com/notFromShrek/statuses/863566329168711681"}
        ...         }
        >>> get_in_reply_to_screen_name(activity_streams_format_dict)
        'notFromShrek'
    """
    if is_original_format(tweet):
        return tweet["in_reply_to_screen_name"]
    reply_target = tweet.get("inReplyTo", None)
    if reply_target is None:
        return None
    # the screen name is the third path segment from the end of the link
    # (".../<screen_name>/statuses/<id>" in the example above)
    link_segments = reply_target["link"].split("/")
    return link_segments[-3]
\n 51 | Note that this is unavailable in activity-streams format 52 | 53 | Args: 54 | tweet (Tweet): A Tweet object (or a dictionary) 55 | 56 | Returns: 57 | str: the user id of the user whose Tweet is being replied to, None 58 | (if not a reply), or for activity-streams raise a NotAvailableError 59 | 60 | Example: 61 | >>> from tweet_parser.getter_methods.tweet_reply import * 62 | >>> original_format_dict = { 63 | ... "created_at": "Wed May 24 20:17:19 +0000 2017", 64 | ... "in_reply_to_user_id_str": "2382763597" 65 | ... } 66 | >>> get_in_reply_to_user_id(original_format_dict) 67 | '2382763597' 68 | """ 69 | 70 | if is_original_format(tweet): 71 | return tweet["in_reply_to_user_id_str"] 72 | else: 73 | raise NotAvailableError("Gnip activity-streams format does not" + 74 | " return the replied to user's id") 75 | 76 | 77 | def get_in_reply_to_status_id(tweet): 78 | """ 79 | Get the tweet id of the Tweet being replied to, None 80 | if this Tweet is not a reply 81 | 82 | Args: 83 | tweet (Tweet): A Tweet object (or a dictionary) 84 | 85 | Returns: 86 | str: the tweet id of the Tweet being replied to 87 | (None if not a reply) 88 | 89 | Example: 90 | >>> from tweet_parser.getter_methods.tweet_reply import * 91 | >>> original_format_dict = { 92 | ... "created_at": "Wed May 24 20:17:19 +0000 2017", 93 | ... "in_reply_to_status_id_str": "863566329168711681" 94 | ... } 95 | >>> get_in_reply_to_status_id(original_format_dict) 96 | '863566329168711681' 97 | 98 | >>> activity_streams_format_dict = { 99 | ... "postedTime": "2017-05-24T20:17:19.000Z", 100 | ... "inReplyTo": 101 | ... {"link": "http://twitter.com/notFromShrek/statuses/863566329168711681"} 102 | ... 
def get_full_text(tweet):
    """
    Get the full text of a tweet dict.
    Includes @-mention replies and long links.

    Args:
        tweet (Tweet or dict): A Tweet object or dictionary

    Returns:
        str: the untruncated text of a Tweet
        (finds extended text if available)

    Example:
        >>> from tweet_parser.getter_methods.tweet_text import get_full_text
        >>> # getting the text of a Tweet that is not truncated
        >>> original_untruncated = {
        ...             "created_at": "Wed May 24 20:17:19 +0000 2017",
        ...             "truncated": False,
        ...             "text": "some tweet text"
        ...            }
        >>> get_full_text(original_untruncated)
        'some tweet text'

        >>> activity_untruncated = {"postedTime": "2017-05-24T20:17:19.000Z",
        ...                         "body": "some tweet text"
        ...                        }
        >>> get_full_text(activity_untruncated)
        'some tweet text'

        >>> # getting the text of a truncated Tweet (has over 140 chars)
        >>> original_truncated = {
        ...             "created_at": "Wed May 24 20:17:19 +0000 2017",
        ...             "text": "some tweet text, lorem ip...",
        ...             "truncated": True,
        ...             "extended_tweet":
        ...                {"full_text":
        ...                  "some tweet text, lorem ipsum dolor sit amet"}
        ...             }
        >>> get_full_text(original_truncated)
        'some tweet text, lorem ipsum dolor sit amet'

        >>> activity_truncated = {
        ...           "postedTime": "2017-05-24T20:17:19.000Z",
        ...           "body": "some tweet text, lorem ip...",
        ...           "long_object":
        ...             {"body":
        ...              "some tweet text, lorem ipsum dolor sit amet"}
        ...          }
        >>> get_full_text(activity_truncated)
        'some tweet text, lorem ipsum dolor sit amet'
    """
    if not is_original_format(tweet):
        # activity streams: the long form, when present, is in "long_object"
        text_holder = tweet["long_object"] if "long_object" in tweet else tweet
        return text_holder["body"]
    # original format: the "truncated" flag says where the full text is
    if tweet["truncated"]:
        return tweet["extended_tweet"]["full_text"]
    return tweet["text"]
"body": "some tweet text"} 91 | >>> get_text(activity) 92 | 'some tweet text' 93 | """ 94 | if is_original_format(tweet): 95 | return tweet["text"] 96 | else: 97 | return tweet["body"] 98 | 99 | 100 | def get_tweet_type(tweet): 101 | """ 102 | Get the type of Tweet this is (3 options: tweet, quote, and retweet) 103 | 104 | Args: 105 | tweet (Tweet or dict): A Tweet object or dictionary 106 | 107 | Returns: 108 | str: (one of 3 strings) 109 | "tweet": an original Tweet 110 | "retweet": a native retweet (created with the retweet button) 111 | "quote": a native quote tweet (etweet button + adding quote text) 112 | 113 | Caveats: 114 | When a quote-tweet (tweet A) is quote-tweeted (tweet B), 115 | the innermost quoted tweet (A) in the payload (for B) 116 | no longer has the key "quoted_status" or "twitter_quoted_status", 117 | and that tweet (A) would be labeled as a "tweet" (not a "quote"). 118 | """ 119 | if is_original_format(tweet): 120 | if "retweeted_status" in tweet: 121 | return "retweet" 122 | elif "quoted_status" in tweet: 123 | return "quote" 124 | else: 125 | return "tweet" 126 | else: 127 | if tweet["verb"] == "share": 128 | return "retweet" 129 | else: 130 | if "twitter_quoted_status" in tweet: 131 | return "quote" 132 | else: 133 | return "tweet" 134 | 135 | 136 | def get_lang(tweet): 137 | """ 138 | Get the language that the Tweet is written in. 139 | 140 | Args: 141 | tweet (Tweet or dict): A Tweet object or dictionary 142 | 143 | Returns: 144 | str: 2-letter BCP 47 language code (or None if undefined) 145 | 146 | Example: 147 | >>> from tweet_parser.getter_methods.tweet_text import get_lang 148 | >>> original = {"created_at": "Wed May 24 20:17:19 +0000 2017", 149 | ... "lang": "en"} 150 | >>> get_lang(original) 151 | 'en' 152 | 153 | >>> activity = {"postedTime": "2017-05-24T20:17:19.000Z", 154 | ... 
"twitter_lang": "en"} 155 | >>> get_lang(activity) 156 | 'en' 157 | """ 158 | if is_original_format(tweet): 159 | lang_field = "lang" 160 | else: 161 | lang_field = "twitter_lang" 162 | if tweet[lang_field] is not None and tweet[lang_field] != "und": 163 | return tweet[lang_field] 164 | else: 165 | return None 166 | 167 | 168 | def get_poll_options(tweet): 169 | """ 170 | Get the text in the options of a poll as a list 171 | - If there is no poll in the Tweet, return an empty list 172 | - If the Tweet is in activity-streams format, raise 'NotAvailableError' 173 | 174 | Args: 175 | tweet (Tweet or dict): A Tweet object or dictionary 176 | 177 | Returns: 178 | list: list of strings, or, in the case where there is no poll, 179 | an empty list 180 | 181 | Raises: 182 | NotAvailableError for activity-streams format 183 | 184 | Example: 185 | >>> from tweet_parser.getter_methods.tweet_text import get_poll_options 186 | >>> original = { 187 | ... "created_at": "Wed May 24 20:17:19 +0000 2017", 188 | ... "entities": {"polls": [{"options": [{"text":"a"}, 189 | ... {"text":"b"}, 190 | ... {"text":"c"}] 191 | ... }]}, 192 | ... } 193 | >>> get_poll_options(original) 194 | ['a', 'b', 'c'] 195 | 196 | >>> activity = {"postedTime": "2017-05-24T20:17:19.000Z", 197 | ... "body": "some tweet text"} 198 | >>> get_poll_options(activity) 199 | Traceback (most recent call last): 200 | ... 
201 | NotAvailableError: Gnip activity-streams format does not return poll options 202 | """ 203 | if is_original_format(tweet): 204 | try: 205 | poll_options_text = [] 206 | for p in tweet["entities"]["polls"]: 207 | for o in p["options"]: 208 | poll_options_text.append(o["text"]) 209 | return poll_options_text 210 | except KeyError: 211 | return [] 212 | 213 | else: 214 | raise NotAvailableError("Gnip activity-streams format does not" + 215 | " return poll options") 216 | 217 | 218 | def get_quote_or_rt_text(tweet): 219 | """ 220 | Get the quoted or retweeted text in a Tweet 221 | (this is not the text entered by the posting user) 222 | - tweet: empty string (there is no quoted or retweeted text) 223 | - quote: only the text of the quoted Tweet 224 | - retweet: the text of the retweet 225 | 226 | Args: 227 | tweet (Tweet or dict): A Tweet object or dictionary 228 | 229 | Returns: 230 | str: text of the retweeted-tweet or the quoted-tweet 231 | (empty string if this is an original Tweet) 232 | 233 | Example: 234 | >>> from tweet_parser.getter_methods.tweet_text import get_quote_or_rt_text 235 | >>> # a quote tweet 236 | >>> quote = {"created_at": "Wed May 24 20:17:19 +0000 2017", 237 | ... "text": "adding my own commentary", 238 | ... "truncated": False, 239 | ... "quoted_status": { 240 | ... "created_at": "Mon May 01 05:00:05 +0000 2017", 241 | ... "truncated": False, 242 | ... "text": "an interesting Tweet" 243 | ... } 244 | ... 
# Compiled once at import time. Raw strings keep "\w" and "\." from being
# read as (invalid) string escapes.
# A t.co short link: scheme, literal host "t.co", then an alphanumeric slug.
_TCO_LINK_REGEX = re.compile(r"https?://t\.co/[A-Za-z0-9]+")
# Anything shaped like "[scheme://]word.word[.word...][/path?query=...]".
_GENERIC_LINK_REGEX = re.compile(r"(https?://)?(\w*[.]\w+)+([/?=&]+\w+)*")


def remove_links(text):
    """
    Helper function to remove the links from the input text

    Only the link itself is replaced; text that follows a t.co link on the
    same line is kept.

    Args:
        text (str): A string

    Returns:
        str: the same text, but with any substring that matches the regex
        for a link removed and replaced with a space

    Example:
        >>> from tweet_parser.getter_methods.tweet_text import remove_links
        >>> text = "lorem ipsum dolor https://twitter.com/RobotPrincessFi"
        >>> remove_links(text)
        'lorem ipsum dolor  '
        >>> remove_links("see https://t.co/abc123 for details")
        'see   for details'
    """
    without_tco = _TCO_LINK_REGEX.sub(" ", text)
    return _GENERIC_LINK_REGEX.sub(" ", without_tco)
"created_at": "Wed May 24 20:17:19 +0000 2017", 58 | ... "user": 59 | ... {"screen_name": "RobotPrincessFi"} 60 | ... } 61 | >>> get_screen_name(original_format_dict) 62 | 'RobotPrincessFi' 63 | 64 | >>> activity_streams_format_dict = { 65 | ... "postedTime": "2017-05-24T20:17:19.000Z", 66 | ... "actor": 67 | ... {"preferredUsername": "RobotPrincessFi"} 68 | ... } 69 | >>> get_screen_name(activity_streams_format_dict) 70 | 'RobotPrincessFi' 71 | """ 72 | 73 | if is_original_format(tweet): 74 | return tweet["user"]["screen_name"] 75 | else: 76 | return tweet["actor"]["preferredUsername"] 77 | 78 | 79 | def get_name(tweet): 80 | """ 81 | Get the display name of the user who posted the Tweet 82 | 83 | Args: 84 | tweet (Tweet): A Tweet object (or a dictionary) 85 | 86 | Returns: 87 | str: the @ handle of the user who posted the Tweet 88 | 89 | Example: 90 | >>> from tweet_parser.getter_methods.tweet_user import get_name 91 | >>> original_format_dict = { 92 | ... "created_at": "Wed May 24 20:17:19 +0000 2017", 93 | ... "user": 94 | ... {"name": "jk no"} 95 | ... } 96 | >>> get_name(original_format_dict) 97 | 'jk no' 98 | 99 | >>> activity_streams_format_dict = { 100 | ... "postedTime": "2017-05-24T20:17:19.000Z", 101 | ... "actor": 102 | ... {"displayName": "jk no"} 103 | ... } 104 | >>> get_name(activity_streams_format_dict) 105 | 'jk no' 106 | """ 107 | 108 | if is_original_format(tweet): 109 | return tweet["user"]["name"] 110 | else: 111 | return tweet["actor"]["displayName"] 112 | 113 | 114 | def get_bio(tweet): 115 | """ 116 | Get the bio text of the user who posted the Tweet 117 | 118 | Args: 119 | tweet (Tweet): A Tweet object (or a dictionary) 120 | 121 | Returns: 122 | str: the bio text of the user who posted the Tweet 123 | In a payload the abscence of a bio seems to be represented by an 124 | empty string or a None, this getter always returns a string (so, empty 125 | string if no bio is available). 
def get_follower_count(tweet):
    """
    Get the number of followers that the user has

    Args:
        tweet (Tweet): A Tweet object (or a dictionary)

    Returns:
        int: the number of followers that the user has

    Example:
        >>> from tweet_parser.getter_methods.tweet_user import get_follower_count
        >>> original_format_dict = {
        ...             "created_at": "Wed May 24 20:17:19 +0000 2017",
        ...             "user":
        ...              {"followers_count": 2}
        ...            }
        >>> get_follower_count(original_format_dict)
        2

        >>> activity_streams_format_dict = {
        ...         "postedTime": "2017-05-24T20:17:19.000Z",
        ...         "actor":
        ...             {"followersCount": 2}
        ...         }
        >>> get_follower_count(activity_streams_format_dict)
        2
    """
    # the two formats differ only in where the count is stored
    if is_original_format(tweet):
        user_key, count_key = "user", "followers_count"
    else:
        user_key, count_key = "actor", "followersCount"
    return tweet[user_key][count_key]
"user": 244 | ... {"derived": {"klout": {"score": 12345}}} 245 | ... } 246 | >>> get_klout_score(original_format_dict) 247 | 12345 248 | 249 | >>> activity_streams_format_dict = { 250 | ... "postedTime": "2017-05-24T20:17:19.000Z", 251 | ... "gnip":{"klout_score": 12345}} 252 | >>> get_klout_score(activity_streams_format_dict) 253 | 12345 254 | """ 255 | try: 256 | if is_original_format(tweet): 257 | score = tweet['user']['derived']['klout']['score'] 258 | else: 259 | score = tweet['gnip']['klout_score'] 260 | return score 261 | except KeyError: 262 | return None 263 | 264 | 265 | @deprecated("See: https://developer.twitter.com/en/docs/tweets/enrichments/overview/klout") 266 | def get_klout_profile(tweet): 267 | """ 268 | Warning: Klout is deprecated and is being removed from Tweet payloads May 2018. \n 269 | See https://developer.twitter.com/en/docs/tweets/enrichments/overview/klout \n 270 | 271 | Get the Klout profile URL of the user (str) (if it exists) 272 | 273 | Args: 274 | tweet (Tweet): A Tweet object (or a dictionary) 275 | 276 | Returns: 277 | str: the user's Klout profile URL (if it exists), else return None 278 | 279 | Example: 280 | >>> from tweet_parser.getter_methods.tweet_user import get_klout_profile 281 | >>> original_format_dict = { 282 | ... "created_at": "Wed May 24 20:17:19 +0000 2017", 283 | ... "user": 284 | ... {"derived": {"klout": 285 | ... {"profile_url": 286 | ... "http://klout.com/topic/id/10000000000000016635"}}} 287 | ... } 288 | >>> get_klout_profile(original_format_dict) 289 | 'http://klout.com/topic/id/10000000000000016635' 290 | 291 | >>> activity_streams_format_dict = { 292 | ... "postedTime": "2017-05-24T20:17:19.000Z", 293 | ... "gnip": 294 | ... {"klout_profile": { 295 | ... "link": "http://klout.com/topic/id/10000000000000016635"} 296 | ... } 297 | ... 
} 298 | >>> get_klout_profile(activity_streams_format_dict) 299 | 'http://klout.com/topic/id/10000000000000016635' 300 | """ 301 | try: 302 | if is_original_format(tweet): 303 | profile = tweet['user']['derived']['klout']['profile_url'] 304 | else: 305 | profile = tweet['gnip']['klout_profile']['link'] 306 | return profile 307 | except KeyError: 308 | return None 309 | 310 | 311 | @deprecated("See: https://developer.twitter.com/en/docs/tweets/enrichments/overview/klout") 312 | def get_klout_id(tweet): 313 | """ 314 | Warning: Klout is deprecated and is being removed from Tweet payloads May 2018. \n 315 | See https://developer.twitter.com/en/docs/tweets/enrichments/overview/klout \n 316 | 317 | Get the Klout ID of the user (str) (if it exists) 318 | 319 | Args: 320 | tweet (Tweet): A Tweet object (or a dictionary) 321 | 322 | Returns: 323 | str: the user's Klout ID (if it exists), else return None 324 | 325 | Example: 326 | >>> from tweet_parser.getter_methods.tweet_user import get_klout_id 327 | >>> original_format_dict = { 328 | ... "created_at": "Wed May 24 20:17:19 +0000 2017", 329 | ... "user": 330 | ... {"derived": {"klout": 331 | ... {"user_id":"1234567890"}}} 332 | ... } 333 | >>> get_klout_id(original_format_dict) 334 | '1234567890' 335 | 336 | >>> activity_streams_format_dict = { 337 | ... "postedTime": "2017-05-24T20:17:19.000Z", 338 | ... "gnip": 339 | ... {"klout_profile": { 340 | ... "klout_user_id": "1234567890"} 341 | ... 
@deprecated("See: https://developer.twitter.com/en/docs/tweets/enrichments/overview/klout")
def get_klout_topics(tweet, topic_type='influence'):
    """
    Warning: Klout is deprecated and is being removed from Tweet payloads May 2018. \n
    See https://developer.twitter.com/en/docs/tweets/enrichments/overview/klout \n

    Get the user's chosen Klout topics (a list of dicts), if it exists.
    Regardless of format or topic type, topic dicts will have the same keys:
    "url", "id", "name", "score"

    Args:
        tweet (Tweet): A Tweet object
        topic_type (str): Which type of Klout topic to return.
                          Options are limited to 'influence' and 'interest'

    Returns:
        list: A list of dicts representing Klout topics, sorted by the
        "score" value in ascending order, or None if Klout topics do not
        exist in the Tweet payload.

    Example:
        >>> result = [{
        ...     # the user's score for that topic
        ...     "score": 0.54,
        ...     # the Klout topic ID
        ...     "id": "10000000000000019376",
        ...     # the Klout topic URL
        ...     "url": "http://klout.com/topic/id/10000000000000019376",
        ...     # the Klout topic name
        ...     "name": "Emoji"
        ...     },
        ...     {
        ...     "score": 0.43,
        ...     "id": "9159",
        ...     "url": "http://klout.com/topic/id/9159",
        ...     "name": "Vegetables"
        ...     }]
    """
    # locate the raw topic list; a missing key anywhere on the path means
    # the payload carries no Klout topics
    try:
        if is_original_format(tweet):
            raw_topics = tweet['user']['derived']['klout']['{}_topics'.format(topic_type)]
        else:
            raw_topics = tweet['gnip']['klout_profile']['topics']
    except KeyError:
        return None
    # the two formats use different field names for url and name, and
    # activity streams mixes all topic types in one list
    if is_original_format(tweet):
        selected_topics = raw_topics
        url_field, name_field = 'url', 'name'
    else:
        selected_topics = [entry for entry in raw_topics
                           if entry['topic_type'] == topic_type]
        url_field, name_field = 'link', 'displayName'
    normalized_topics = [dict(url=entry[url_field],
                              id=entry['id'],
                              name=entry[name_field],
                              score=entry['score'])
                         for entry in selected_topics]
    return sorted(normalized_topics, key=lambda entry: entry['score'])
class Tweet(dict):
    """
    Tweet object created from a dictionary representing a Tweet payload

    Args:
        tweet_dict (dict): A dictionary representing a Tweet payload
        do_format_validation (bool): If "True", compare the keys in this \
        dict to a superset of expected keys and to a minimum set of expected \
        keys (as defined in tweet_parser.tweet_keys). \
        Will cause the parser to fail if unexpected keys are present \
        or if expected keys are missing. \
        Intended to allow run-time format testing, allowing the user \
        to surface unexpected format changes.

    Returns:
        Tweet: Class "Tweet", inherits from dict, provides properties to
        get various data values from the Tweet.

    Raises:
        NotATweetError: the Tweet dict is malformed, \
        see `tweet_checking.check_tweet` for details

    Example:
        >>> from tweet_parser.tweet import Tweet
        >>> # python dict representing a Tweet
        >>> tweet_dict = {"id": 867474613139156993,
        ...               "id_str": "867474613139156993",
        ...               "created_at": "Wed May 24 20:17:19 +0000 2017",
        ...               "text": "Some Tweet text",
        ...               "user": {
        ...                   "screen_name": "RobotPrincessFi",
        ...                   "id_str": "815279070241955840"
        ...                   }
        ...              }
        >>> # create a Tweet object
        >>> tweet = Tweet(tweet_dict)
        >>> # use the Tweet obj to access data elements
        >>> tweet.id
        '867474613139156993'
        >>> tweet.created_at_seconds
        1495657039
    """
    def __init__(self, tweet_dict, do_format_validation=False):
        """
        Initialize a Tweet object from a dict representing a Tweet payload

        Args:
            tweet_dict (dict): the Tweet payload
            do_format_validation (bool): passed through to
                `tweet_checking.check_tweet` to enable key validation

        Raises:
            NotATweetError: raised by `tweet_checking.check_tweet` when the
                payload is not a Tweet
        """

        # get the format of the Tweet data
        # also, this throws an error if it's not a tweet
        self.original_format = tweet_checking.check_tweet(tweet_dict,
                                                          do_format_validation)

        # make sure that this obj has all of the keys that our dict had
        self.update(tweet_dict)

    @lazy_property
    def id(self):
        """
        Tweet snowflake id as a string

        Returns:
            str: Twitter snowflake id, numeric only (no other text)

        Example:
            >>> from tweet_parser.tweet import Tweet
            >>> original_format_dict = {
            ...             "created_at": "Wed May 24 20:17:19 +0000 2017",
            ...             "id": 867474613139156993,
            ...             "id_str": "867474613139156993",
            ...             "user": {"user_keys":"user_data"},
            ...             "text": "some tweet text"
            ...            }
            >>> Tweet(original_format_dict).id
            '867474613139156993'

            >>> activity_streams_dict = {
            ...         "postedTime": "2017-05-24T20:17:19.000Z",
            ...         "id": "tag:search.twitter.com,2005:867474613139156993",
            ...         "actor": {"user_keys":"user_data"},
            ...         "body": "some tweet text"
            ...        }
            >>> Tweet(activity_streams_dict).id
            '867474613139156993'
        """
        if self.original_format:
            return self["id_str"]
        else:
            # activity-streams ids look like "tag:...:<snowflake>"
            return self["id"].split(":")[-1]

    @lazy_property
    def created_at_seconds(self):
        """
        Time that a Tweet was posted in seconds since the Unix epoch

        Returns:
            int: seconds since the unix epoch
            (determined by converting Tweet.id
            into a timestamp using `tweet_date.snowflake2utc`)
        """
        return tweet_date.snowflake2utc(self.id)

    @lazy_property
    def created_at_datetime(self):
        """
        Time that a Tweet was posted as a Python datetime object

        Returns:
            datetime.datetime: the value of `tweet.created_at_seconds`
            converted into a (naive, UTC) datetime object
        """
        return datetime.datetime.utcfromtimestamp(self.created_at_seconds)

    @lazy_property
    def created_at_string(self):
        """
        Time that a Tweet was posted as a string with the format
        YYYY-mm-ddTHH:MM:SS.000Z

        Returns:
            str: the value of `tweet.created_at_seconds`
            converted into a string (YYYY-mm-ddTHH:MM:SS.000Z)
        """
        return self.created_at_datetime.strftime("%Y-%m-%dT%H:%M:%S.000Z")

    @lazy_property
    def user_id(self):
        """
        The Twitter ID of the user who posted the Tweet

        Returns:
            str: value returned by calling `tweet_user.get_user_id` on `self`
        """
        return tweet_user.get_user_id(self)

    @lazy_property
    def screen_name(self):
        """
        The screen name (@ handle) of the user who posted the Tweet

        Returns:
            str: value returned by calling `tweet_user.get_screen_name` on `self`
        """
        return tweet_user.get_screen_name(self)

    @lazy_property
    def name(self):
        """
        The display name of the user who posted the Tweet

        Returns:
            str: value returned by calling `tweet_user.get_name` on `self`
        """
        return tweet_user.get_name(self)

    @lazy_property
    def bio(self):
        """
        The bio text of the user who posted the Tweet

        Returns:
            str: the user's bio text.
            value returned by calling `tweet_user.get_bio` on `self`
        """
        return tweet_user.get_bio(self)

    @lazy_property
    def follower_count(self):
        """
        The number of followers that the author of the Tweet has

        Returns:
            int: the number of followers.
            value returned by calling `tweet_user.get_follower_count` on `self`
        """
        return tweet_user.get_follower_count(self)

    @lazy_property
    def following_count(self):
        """
        The number of accounts that the author of the Tweet is following

        Returns:
            int: the number of accounts that the author of the Tweet is following,
            value returned by calling `tweet_user.get_following_count` on `self`
        """
        return tweet_user.get_following_count(self)

    @lazy_property
    def klout_score(self):
        """
        (DEPRECATED):
        The Klout score (int) (if it exists) of the user who posted the Tweet

        Returns:
            int: value returned by calling `tweet_user.get_klout_score` on `self`
            (if no Klout is present, this returns a None)
        """
        return tweet_user.get_klout_score(self)

    @lazy_property
    def klout_profile(self):
        """
        (DEPRECATED):
        The Klout profile URL of the user (`str`) (if it exists)

        Returns:
            str: value returned by calling `tweet_user.get_klout_profile` on `self`
            (if no Klout is present, this returns a `None`)
        """
        return tweet_user.get_klout_profile(self)

    @lazy_property
    def klout_id(self):
        """
        (DEPRECATED):
        The Klout ID of the user (`str`) (if it exists)

        Returns:
            str: value returned by calling `tweet_user.get_klout_id` on `self`
            (if no Klout is present, this returns a `None`)
        """
        return tweet_user.get_klout_id(self)

    @lazy_property
    def klout_influence_topics(self):
        """
        (DEPRECATED):
        Get the user's Klout influence topics (a list of dicts), if it exists.
        Topic dicts will have these keys: `url`, `id`, `name`, `score`

        Returns:
            list: value returned by calling
            `tweet_user.get_klout_topics(self, topic_type = 'influence')`
            (if no Klout is present, this returns a `None`)
        """
        return tweet_user.get_klout_topics(self, topic_type='influence')

    @lazy_property
    def klout_interest_topics(self):
        """
        (DEPRECATED):
        Get the user's Klout interest topics (a list of dicts), if it exists.
        Topic dicts will have these keys: `url`, `id`, `name`, `score`

        Returns:
            list: value returned by calling
            `tweet_user.get_klout_topics(self, topic_type = 'interest')`
            (if no Klout is present, this returns a `None`)
        """
        return tweet_user.get_klout_topics(self, topic_type='interest')

    @lazy_property
    def text(self):
        """
        The contents of "text" (original format)
        or "body" (activity streams format)

        Returns:
            str: value returned by calling `tweet_text.get_text` on `self`
        """
        return tweet_text.get_text(self)

    @lazy_property
    def tweet_type(self):
        """
        The type of Tweet this is (3 options: tweet, quote, and retweet)

        Returns:
            str: ("tweet","quote" or "retweet" only)
            value returned by calling `tweet_text.get_tweet_type` on `self`
        """
        return tweet_text.get_tweet_type(self)

    @lazy_property
    def user_entered_text(self):
        """
        The text that the posting user entered \n
        *tweet*: untruncated (includes @-mention replies and long links)
        text of an original Tweet \n
        *quote tweet*: untruncated poster-added content in a quote-tweet \n
        *retweet*: empty string

        Returns:
            str: if `tweet.tweet_type == "retweet"`, returns an empty string
            else, returns the value of `tweet_text.get_full_text(self)`
        """
        if self.tweet_type == "retweet":
            return ""
        return tweet_text.get_full_text(self)

    @lazy_property
    def lang(self):
        """
        The language that the Tweet is written in.

        Returns:
            str: 2-letter BCP 47 language code (or None if undefined)
            Value returned by calling `tweet_text.get_lang` on `self`
        """
        return tweet_text.get_lang(self)

    @lazy_property
    def poll_options(self):
        """
        The text in the options of a poll as a list. \
        If there is no poll in the Tweet, return an empty list. \
        If activity-streams format, raise `NotAvailableError`

        Returns:
            list (list of strings): value returned by calling
            `tweet_text.get_poll_options` on `self`
        """
        return tweet_text.get_poll_options(self)

    @lazy_property
    def quote_or_rt_text(self):
        """
        The quoted or retweeted text in a Tweet
        (this is not the text entered by the posting user) \n
        - tweet: empty string (there is no quoted or retweeted text) \n
        - quote: only the text of the quoted Tweet \n
        - retweet: the text of the retweet

        Returns:
            str: value returned by calling
            `tweet_text.get_quote_or_rt_text` on `self`
        """
        return tweet_text.get_quote_or_rt_text(self)

    @lazy_property
    def all_text(self):
        """
        All of the text of the tweet. This includes @ mentions, long links,
        quote-tweet contents (separated by a newline), RT contents
        & poll options

        Returns:
            str: value returned by calling `tweet_text.get_all_text` on `self`
        """
        return tweet_text.get_all_text(self)

    @lazy_property
    def geo_coordinates(self):
        """
        The user's geo coordinates, if they are included in the payload
        (otherwise return `None`).
        Dictionary with the keys "latitude" and "longitude" or `None`

        Returns:
            dict: value returned by calling `tweet_geo.get_geo_coordinates` on `self`
        """
        return tweet_geo.get_geo_coordinates(self)

    @lazy_property
    def profile_location(self):
        """
        User's derived location data from the profile location enrichment
        If unavailable, returns `None`.

        Returns:
            dict: value returned by calling `tweet_geo.get_profile_location` on `self`

        Example:
            >>> result = {"country": "US",         # Two letter ISO-3166 country code
            ...           "locality": "Boulder",   # The locality location (~ city)
            ...           "region": "Colorado",    # The region location (~ state/province)
            ...           "sub_region": "Boulder", # The sub-region location (~ county)
            ...           "full_name": "Boulder, Colorado, US", # The full name (excluding sub-region)
            ...           "geo":  [40,-105]        # lat/long value that coordinate that corresponds to
            ...                            # the lowest granularity location for where the user
            ...                            # who created the Tweet is from
            ...  }
        """
        return tweet_geo.get_profile_location(self)

    @lazy_property
    def tweet_links(self):
        """
        The links that are included in the Tweet as "urls"
        (if there are no links, this is an empty list)
        This includes links that are included in quoted or retweeted Tweets
        Returns unrolled or expanded_url information if it is available

        Returns:
            list (list of dicts): A list of dictionaries containing information
            about urls. Each dictionary entity can have these keys; without
            unwound url or expanded url Twitter data enrichments many of these
            fields will be missing.
            (value returned by calling `tweet_links.get_tweet_links` on `self`)

        Example:
            >>> result = [
            ...   {
            ...   # url that shows up in the tweet text
            ...   'display_url': "https://twitter.com/RobotPrinc...",
            ...   # long (expanded) url
            ...   'expanded_url': "https://twitter.com/RobotPrincessFi",
            ...   # characters where the display link is
            ...   'indices': [55, 88],
            ...   'unwound': {
            ...      # description from the linked webpage
            ...      'description': "the Twitter profile of RobotPrincessFi",
            ...      'status': 200,
            ...      # title of the webpage
            ...      'title': "the Twitter profile of RobotPrincessFi",
            ...      # long (expanded) url
            ...      'url': "https://twitter.com/RobotPrincessFi"},
            ...   # the url that tweet directs to, often t.co
            ...   'url': "t.co/1234"}]
        """
        return tweet_links.get_tweet_links(self)

    @lazy_property
    def most_unrolled_urls(self):
        """
        For each url included in the Tweet "urls", get the most unrolled
        version available. Only return 1 url string per url in tweet.tweet_links
        In order of preference for "most unrolled"
        (keys from the dict at tweet.tweet_links): \n
        1. `unwound`/`url` \n
        2. `expanded_url` \n
        3. `url`

        Returns:
            list (a list of strings): list of urls
            value returned by calling `tweet_links.get_most_unrolled_urls` on `self`
        """
        return tweet_links.get_most_unrolled_urls(self)

    @lazy_property
    def user_mentions(self):
        """
        The @-mentions in the Tweet as dictionaries.
        Note that in the case of a quote-tweet, this does not return the users
        mentioned in the quoted status. The recommended way to get that list
        would be to use 'tweet.quoted_tweet.user_mentions'.
        Also note that in the case of a quote-tweet, the list of @-mentioned
        users does not include the user who authored the original (quoted) Tweet,
        you can get the author of the quoted tweet using
        `tweet.quoted_tweet.user_id`

        Returns:
            list (list of dicts): 1 item per @ mention,
            value returned by calling `tweet_entities.get_user_mentions` on `self`

        Example:
            >>> result = {
            ...   #characters where the @ mention appears
            ...   "indices": [14,26],
            ...   #id of @ mentioned user as a string
            ...   "id_str": "2382763597",
            ...   #screen_name of @ mentioned user
            ...   "screen_name": "notFromShrek",
            ...   #display name of @ mentioned user
            ...   "name": "Fiona",
            ...   #id of @ mentioned user as an int
            ...   "id": 2382763597
            ... }

        """
        return tweet_entities.get_user_mentions(self)

    @lazy_property
    def hashtags(self):
        """
        A list of hashtags in the Tweet.
        Note that in the case of a quote-tweet, this does not return the
        hashtags in the quoted status. The recommended way to get that list
        would be to use `tweet.quoted_tweet.hashtags`

        Returns:
            list (a list of strings): list of all of the hashtags in the Tweet
            value returned by calling `tweet_entities.get_hashtags` on `self`
        """
        return tweet_entities.get_hashtags(self)

    @lazy_property
    def media_urls(self):
        """
        A list of all media (https) urls in the tweet, useful for grabbing
        photo/video urls for other purposes.

        Returns:
            list (a list of strings): list of all of the media urls in the Tweet
            value returned by calling `tweet_entities.get_media_urls` on `self`
        """
        return tweet_entities.get_media_urls(self)

    @lazy_property
    def quoted_tweet(self):
        """
        The quoted Tweet as a Tweet object
        If the Tweet is not a quote Tweet, return None
        If the quoted Tweet payload cannot be loaded as a Tweet, this will
        raise a "NotATweetError"

        Returns:
            Tweet: A Tweet representing the quoted status (or None)
            (see `tweet_embeds.get_quoted_tweet`, this is that value as a Tweet)

        Raises:
            NotATweetError: if quoted tweet is malformed
        """
        quote_tweet = tweet_embeds.get_quoted_tweet(self)
        if quote_tweet is None:
            return None
        try:
            return Tweet(quote_tweet)
        except NotATweetError as nate:
            raise NotATweetError(
                "The quote-tweet payload appears malformed."
                " Failed with '{}'".format(nate))

    @lazy_property
    def retweeted_tweet(self):
        """
        The retweeted Tweet as a Tweet object
        If the Tweet is not a Retweet, return None
        If the Retweet payload cannot be loaded as a Tweet, this will
        raise a `NotATweetError`

        Returns:
            Tweet: A Tweet representing the retweeted status (or None)
            (see `tweet_embeds.get_retweeted_tweet`, this is that value as a Tweet)

        Raises:
            NotATweetError: if retweeted tweet is malformed
        """
        retweet = tweet_embeds.get_retweeted_tweet(self)
        if retweet is None:
            return None
        try:
            return Tweet(retweet)
        except NotATweetError as nate:
            raise NotATweetError(
                "The retweet payload appears malformed."
                " Failed with '{}'".format(nate))

    @lazy_property
    def embedded_tweet(self):
        """
        Get the retweeted Tweet OR the quoted Tweet and return it as a Tweet object

        Returns:
            Tweet (or None, if the Tweet is neither a quote tweet or a Retweet):
            a Tweet representing the quote Tweet or the Retweet
            (see `tweet_embeds.get_embedded_tweet`, this is that value as a Tweet)

        Raises:
            NotATweetError: if embedded tweet is malformed
        """
        embedded_tweet = tweet_embeds.get_embedded_tweet(self)
        if embedded_tweet is None:
            return None
        try:
            return Tweet(embedded_tweet)
        except NotATweetError as nate:
            # Format the whole message at once. A method call binds tighter
            # than "+", so formatting only the trailing literal would leave
            # the first placeholder unfilled and put the payload where the
            # error text belongs.
            raise NotATweetError(
                "The embedded tweet payload {} appears malformed."
                " Failed with '{}'".format(embedded_tweet, nate))

    @lazy_property
    def gnip_matching_rules(self):
        """
        Get the Gnip tagged rules that this tweet matched.

        Returns:
            List of potential tags with the matching rule or None if no rules
            are defined.
            (value returned by calling `gnip_fields.get_matching_rules` on `self`)
        """
        return gnip_fields.get_matching_rules(self)

    @lazy_property
    def generator(self):
        """
        Get information about the application that generated the Tweet

        Returns:
            dict: keys are 'link' and 'name', the link to and name of the application
            that generated the Tweet.
            value returned by calling `tweet_generator.get_generator` on `self`
        """
        return tweet_generator.get_generator(self)

    @lazy_property
    def in_reply_to_screen_name(self):
        """
        The screen name of the user being replied to (None if the Tweet isn't a reply)

        Returns:
            str: value returned by calling `tweet_reply.get_in_reply_to_screen_name` on `self`
        """
        return tweet_reply.get_in_reply_to_screen_name(self)

    @lazy_property
    def in_reply_to_user_id(self):
        """
        The user id of the user being replied to (None if the Tweet isn't a reply).
        This raises a NotAvailableError for activity-streams format

        Returns:
            str: value returned by calling `tweet_reply.get_in_reply_to_user_id` on `self`
        """
        return tweet_reply.get_in_reply_to_user_id(self)

    @lazy_property
    def in_reply_to_status_id(self):
        """
        The status id of the Tweet being replied to (None if the Tweet isn't a reply)

        Returns:
            str: value returned by calling `tweet_reply.get_in_reply_to_status_id` on `self`
        """
        return tweet_reply.get_in_reply_to_status_id(self)

    @lazy_property
    def favorite_count(self):
        """
        The number of favorites that this tweet has received *at the time of
        retrieval*. If a tweet is obtained from a live stream, this will likely
        be 0.

        Returns:
            int: value returned by calling `tweet_counts.get_favorite_count` on `self`
        """
        return tweet_counts.get_favorite_count(self)

    @lazy_property
    def quote_count(self):
        """
        The number of tweets that this tweet has been quoted in *at the time of
        retrieval*. If a tweet is obtained from a live stream, this will likely
        be 0.
        This raises a NotAvailableError for activity-streams format

        Returns:
            int: value returned by calling `tweet_counts.get_quote_count` on `self`
            or raises NotAvailableError
        """
        return tweet_counts.get_quote_count(self)

    @lazy_property
    def retweet_count(self):
        """
        The number of times this tweet has been retweeted *at the time of
        retrieval*. If a tweet is obtained from a live stream, this will likely
        be 0.

        Returns:
            int: value returned by calling `tweet_counts.get_retweet_count` on `self`
        """
        return tweet_counts.get_retweet_count(self)
def is_original_format(tweet):
    """
    Simple checker to flag the format of a tweet.

    Args:
        tweet (Tweet): tweet in question

    Returns:
        bool: True when the payload has a "created_at" key (original
        format), False when it has a "postedTime" key (activity streams).

    Raises:
        NotATweetError: when neither key is present.

    Example:
        >>> import tweet_parser.tweet_checking as tc
        >>> tweet = {"created_at": 124125125125,
        ...          "text": "just setting up my twttr",
        ...          "nested_field": {"nested_1": "field", "nested_2": "field2"}}
        >>> tc.is_original_format(tweet)
        True
    """
    # "created_at" takes precedence if a payload somehow carries both keys
    if "created_at" in tweet:
        return True
    if "postedTime" in tweet:
        return False
    raise NotATweetError("This dict has neither 'created_at' or 'postedTime' as keys")


def get_all_keys(tweet, parent_key=''):
    """
    Takes a tweet object and recursively returns a list of all keys contained
    in this level and all nested levels of the tweet.

    Nested keys are reported as space-separated paths. Only dict values are
    descended into; a nested dict that is empty contributes no entries.

    Args:
        tweet (Tweet): the tweet dict
        parent_key (str): path prefix prepended (with a space) to every key
            found in `tweet`; used by the recursion to build nested paths.

    Returns:
        list of all key paths in the (nested) dict.

    Example:
        >>> import tweet_parser.tweet_checking as tc
        >>> tweet = {"created_at": 124125125125, "text": "just setting up my twttr",
        ...          "nested_field": {"nested_1": "field", "nested_2": "field2"}}
        >>> tc.get_all_keys(tweet)
        ['created_at', 'text', 'nested_field nested_1', 'nested_field nested_2']
    """
    items = []
    for key, value in tweet.items():
        path = parent_key + " " + key
        if isinstance(value, dict):
            items.extend(get_all_keys(value, parent_key=path))
        else:
            # top-level paths start with the joining space; drop it
            items.append(path.strip(" "))
    return items


def key_validation_check(tweet_keys_list, superset_keys, minset_keys):
    """
    Validates the keys present in a Tweet.

    Args:
        tweet_keys_list (list): the keys present in a tweet
        superset_keys (set): the set of all possible keys for a tweet
        minset_keys (set): the set of minimal keys expected in a tweet.

    Returns:
        0 if no errors

    Raises:
        UnexpectedFormatError on any mismatch of keys.
    """
    tweet_keys = set(tweet_keys_list)
    # check for keys that must be present
    missing_keys = minset_keys - tweet_keys
    if missing_keys:
        raise UnexpectedFormatError("keys ({}) missing from Tweet (Public API data is not supported)"
                                    .format(missing_keys))
    # check for keys that could be present
    unexpected_keys = tweet_keys - superset_keys
    if unexpected_keys:
        raise UnexpectedFormatError("Unexpected keys ({}) are in this Tweet"
                                    .format(unexpected_keys))
    return 0


def _require_keys(tweet, required_keys):
    """Raise NotATweetError for the first of `required_keys` missing from `tweet`."""
    for key in required_keys:
        if key not in tweet:
            raise NotATweetError("This dict has no '{}' key".format(key))


def _check_original_format_tweet(tweet, validation_checking=False):
    """
    Check an original-format payload for its "user" and "text" keys and,
    when `validation_checking` is true, validate its full key structure.

    Raises:
        NotATweetError: a required top-level key is missing.
        UnexpectedFormatError: key validation was requested and failed.
    """
    _require_keys(tweet, ["user", "text"])
    # check for changing keys
    if validation_checking:
        key_validation_check(get_all_keys(tweet),
                             original_format_superset_keys,
                             original_format_minimum_set_keys)


def _check_activity_streams_tweet(tweet, validation_checking=False):
    """
    Check an activity-streams payload for its "actor" and "body" keys and,
    when `validation_checking` is true, validate its full key structure.

    Raises:
        NotATweetError: a required top-level key is missing.
        UnexpectedFormatError: key validation was requested and failed.
    """
    _require_keys(tweet, ["actor", "body"])
    # check for changing keys
    if validation_checking:
        key_validation_check(get_all_keys(tweet),
                             activity_streams_superset_keys,
                             activity_streams_minimum_set_keys)


def check_tweet(tweet, validation_checking=False):
    """
    Ensures a tweet is valid and determines the type of format for the tweet.

    Args:
        tweet (dict/Tweet): the tweet payload
        validation_checking (bool): check for valid key structure in a tweet.

    Returns:
        bool: True for an original-format payload, False for an
        activity-streams payload.

    Raises:
        NotATweetError: the payload lacks "id", lacks both format marker
            keys, or lacks a required key for its format.
        UnexpectedFormatError: `validation_checking` is true and the key
            structure does not match the expected key sets.
    """

    if "id" not in tweet:
        raise NotATweetError("This text has no 'id' key")

    original_format = is_original_format(tweet)

    if original_format:
        _check_original_format_tweet(tweet, validation_checking=validation_checking)
    else:
        _check_activity_streams_tweet(tweet, validation_checking=validation_checking)

    return original_format
# Licensed under the MIT License
# https://opensource.org/licenses/MIT

# Key sets used for optional payload validation.
#
# tweet_checking passes get_all_keys(tweet) together with one "superset" and
# one "minimum" set to key_validation_check.  Each entry looks like a
# space-joined path of nested dictionary keys (for example 'user id' for
# tweet['user']['id']) -- NOTE(review): confirm against get_all_keys.
#
# The sets are kept as plain (mutable) ``set`` objects so existing callers
# that rely on the ``set`` type keep working.

# Every key path that may appear in an original-format Tweet payload.
original_format_superset_keys = {
    'contributors',
    'coordinates',
    'created_at',
    'display_text_range',
    'entities hashtags',
    'entities media',
    'entities polls',
    'entities symbols',
    'entities urls',
    'entities user_mentions',
    'extended_entities media',
    'extended_tweet display_text_range',
    'extended_tweet entities hashtags',
    'extended_tweet entities media',
    'extended_tweet entities symbols',
    'extended_tweet entities urls',
    'extended_tweet entities user_mentions',
    'extended_tweet extended_entities media',
    'extended_tweet full_text',
    'favorite_count',
    'favorited',
    'filter_level',
    'geo',
    'id',
    'id_str',
    'in_reply_to_screen_name',
    'in_reply_to_status_id',
    'in_reply_to_status_id_str',
    'in_reply_to_user_id',
    'in_reply_to_user_id_str',
    'is_quote_status',
    'lang',
    'matching_rules',
    'place',
    'place bounding_box coordinates',
    'place bounding_box type',
    'place country',
    'place country_code',
    'place full_name',
    'place id',
    'place name',
    'place place_type',
    'place url',
    'possibly_sensitive',
    'quote_count',
    'quoted_status contributors',
    'quoted_status coordinates',
    'quoted_status created_at',
    'quoted_status display_text_range',
    'quoted_status entities hashtags',
    'quoted_status entities symbols',
    'quoted_status entities urls',
    'quoted_status entities user_mentions',
    'quoted_status extended_tweet display_text_range',
    'quoted_status extended_tweet entities hashtags',
    'quoted_status extended_tweet entities media',
    'quoted_status extended_tweet entities symbols',
    'quoted_status extended_tweet entities urls',
    'quoted_status extended_tweet entities user_mentions',
    'quoted_status extended_tweet extended_entities media',
    'quoted_status extended_tweet full_text',
    'quoted_status favorite_count',
    'quoted_status favorited',
    'quoted_status filter_level',
    'quoted_status geo',
    'quoted_status id',
    'quoted_status id_str',
    'quoted_status in_reply_to_screen_name',
    'quoted_status in_reply_to_status_id',
    'quoted_status in_reply_to_status_id_str',
    'quoted_status in_reply_to_user_id',
    'quoted_status in_reply_to_user_id_str',
    'quoted_status is_quote_status',
    'quoted_status lang',
    'quoted_status place',
    'quoted_status place bounding_box coordinates',
    'quoted_status place bounding_box type',
    'quoted_status place country',
    'quoted_status place country_code',
    'quoted_status place full_name',
    'quoted_status place id',
    'quoted_status place name',
    'quoted_status place place_type',
    'quoted_status place url',
    'quoted_status possibly_sensitive',
    'quoted_status quote_count',
    'quoted_status quoted_status_id',
    'quoted_status quoted_status_id_str',
    'quoted_status reply_count',
    'quoted_status retweet_count',
    'quoted_status retweeted',
    'quoted_status source',
    'quoted_status text',
    'quoted_status truncated',
    'quoted_status user contributors_enabled',
    'quoted_status user created_at',
    'quoted_status user default_profile',
    'quoted_status user default_profile_image',
    'quoted_status user description',
    'quoted_status user favourites_count',
    'quoted_status user follow_request_sent',
    'quoted_status user followers_count',
    'quoted_status user following',
    'quoted_status user friends_count',
    'quoted_status user geo_enabled',
    'quoted_status user id',
    'quoted_status user id_str',
    'quoted_status user is_translator',
    'quoted_status user lang',
    'quoted_status user listed_count',
    'quoted_status user location',
    'quoted_status user name',
    'quoted_status user notifications',
    'quoted_status user profile_background_color',
    'quoted_status user profile_background_image_url',
    'quoted_status user profile_background_image_url_https',
    'quoted_status user profile_background_tile',
    'quoted_status user profile_banner_url',
    'quoted_status user profile_image_url',
    'quoted_status user profile_image_url_https',
    'quoted_status user profile_link_color',
    'quoted_status user profile_sidebar_border_color',
    'quoted_status user profile_sidebar_fill_color',
    'quoted_status user profile_text_color',
    'quoted_status user profile_use_background_image',
    'quoted_status user protected',
    'quoted_status user screen_name',
    'quoted_status user statuses_count',
    'quoted_status user time_zone',
    'quoted_status user translator_type',
    'quoted_status user url',
    'quoted_status user verified',
    'quoted_status_id',
    'quoted_status_id_str',
    'reply_count',
    'retweet_count',
    'retweeted',
    'retweeted_status contributors',
    'retweeted_status coordinates',
    'retweeted_status created_at',
    'retweeted_status display_text_range',
    'retweeted_status entities hashtags',
    'retweeted_status entities symbols',
    'retweeted_status entities urls',
    'retweeted_status entities user_mentions',
    'retweeted_status extended_tweet display_text_range',
    'retweeted_status extended_tweet entities hashtags',
    'retweeted_status extended_tweet entities media',
    'retweeted_status extended_tweet entities symbols',
    'retweeted_status extended_tweet entities urls',
    'retweeted_status extended_tweet entities user_mentions',
    'retweeted_status extended_tweet extended_entities media',
    'retweeted_status extended_tweet full_text',
    'retweeted_status favorite_count',
    'retweeted_status favorited',
    'retweeted_status filter_level',
    'retweeted_status geo',
    'retweeted_status id',
    'retweeted_status id_str',
    'retweeted_status in_reply_to_screen_name',
    'retweeted_status in_reply_to_status_id',
    'retweeted_status in_reply_to_status_id_str',
    'retweeted_status in_reply_to_user_id',
    'retweeted_status in_reply_to_user_id_str',
    'retweeted_status is_quote_status',
    'retweeted_status lang',
    'retweeted_status place bounding_box coordinates',
    'retweeted_status place bounding_box type',
    'retweeted_status place country',
    'retweeted_status place country_code',
    'retweeted_status place full_name',
    'retweeted_status place id',
    'retweeted_status place name',
    'retweeted_status place place_type',
    'retweeted_status place url',
    'retweeted_status possibly_sensitive',
    'retweeted_status quote_count',
    'retweeted_status reply_count',
    'retweeted_status retweet_count',
    'retweeted_status retweeted',
    'retweeted_status source',
    'retweeted_status text',
    'retweeted_status truncated',
    'retweeted_status user contributors_enabled',
    'retweeted_status user created_at',
    'retweeted_status user default_profile',
    'retweeted_status user default_profile_image',
    'retweeted_status user description',
    'retweeted_status user favourites_count',
    'retweeted_status user follow_request_sent',
    'retweeted_status user followers_count',
    'retweeted_status user following',
    'retweeted_status user friends_count',
    'retweeted_status user geo_enabled',
    'retweeted_status user id',
    'retweeted_status user id_str',
    'retweeted_status user is_translator',
    'retweeted_status user lang',
    'retweeted_status user listed_count',
    'retweeted_status user location',
    'retweeted_status user name',
    'retweeted_status user notifications',
    'retweeted_status user profile_background_color',
    'retweeted_status user profile_background_image_url',
    'retweeted_status user profile_background_image_url_https',
    'retweeted_status user profile_background_tile',
    'retweeted_status user profile_banner_url',
    'retweeted_status user profile_image_url',
    'retweeted_status user profile_image_url_https',
    'retweeted_status user profile_link_color',
    'retweeted_status user profile_sidebar_border_color',
    'retweeted_status user profile_sidebar_fill_color',
    'retweeted_status user profile_text_color',
    'retweeted_status user profile_use_background_image',
    'retweeted_status user protected',
    'retweeted_status user screen_name',
    'retweeted_status user statuses_count',
    'retweeted_status user time_zone',
    'retweeted_status user translator_type',
    'retweeted_status user url',
    'retweeted_status user verified',
    'source',
    'text',
    'truncated',
    'user contributors_enabled',
    'user created_at',
    'user default_profile',
    'user default_profile_image',
    'user derived klout influence_topics',
    'user derived klout interest_topics',
    'user derived klout profile_url',
    'user derived klout score',
    'user derived klout user_id',
    'user description',
    'user favourites_count',
    'user follow_request_sent',
    'user followers_count',
    'user following',
    'user friends_count',
    'user geo_enabled',
    'user id',
    'user id_str',
    'user is_translator',
    'user lang',
    'user listed_count',
    'user location',
    'user name',
    'user notifications',
    'user profile_background_color',
    'user profile_background_image_url',
    'user profile_background_image_url_https',
    'user profile_background_tile',
    'user profile_image_url',
    'user profile_image_url_https',
    'user profile_link_color',
    'user profile_sidebar_border_color',
    'user profile_sidebar_fill_color',
    'user profile_text_color',
    'user profile_use_background_image',
    'user protected',
    'user screen_name',
    'user statuses_count',
    'user time_zone',
    'user translator_type',
    'user url',
    'user utc_offset',
    'user verified',
    # --- Keys appended after the alphabetised list above (original order kept).
    # Entries that merely repeated a key already listed above ('geo',
    # 'coordinates', 'user derived klout ...') were dropped; membership of the
    # set is unchanged.
    'scopes followers',  # NOTE: the meaning of this key is not documented here
    'retweeted_status place',
    'user profile_banner_url',
    'retweeted_status extended_entities media',
    'retweeted_status entities media',
    'user derived locations',
    'retweeted_status quoted_status in_reply_to_status_id_str',
    'retweeted_status quoted_status user default_profile',
    'retweeted_status quoted_status user profile_background_color',
    'retweeted_status quoted_status truncated',
    'retweeted_status quoted_status user profile_background_image_url',
    'retweeted_status quoted_status user followers_count',
    'retweeted_status quoted_status user id',
    'retweeted_status quoted_status_id',
    'retweeted_status quoted_status lang',
    'retweeted_status quoted_status in_reply_to_user_id',
    'retweeted_status quoted_status user protected',
    'retweeted_status quoted_status user profile_use_background_image',
    'retweeted_status quoted_status user profile_image_url',
    'retweeted_status quoted_status user is_translator',
    'retweeted_status quoted_status text',
    'retweeted_status quoted_status in_reply_to_status_id',
    'retweeted_status quoted_status id',
    'retweeted_status quoted_status user favourites_count',
    'retweeted_status quoted_status coordinates',
    'retweeted_status quoted_status user description',
    'retweeted_status quoted_status source',
    'retweeted_status quoted_status user profile_image_url_https',
    'retweeted_status quoted_status contributors',
    'retweeted_status quoted_status user follow_request_sent',
    'retweeted_status quoted_status created_at',
    'retweeted_status quoted_status entities user_mentions',
    'retweeted_status quoted_status user profile_sidebar_fill_color',
    'retweeted_status quoted_status quote_count',
    'retweeted_status quoted_status user following',
    'retweeted_status quoted_status user profile_banner_url',
    'retweeted_status quoted_status reply_count',
    'retweeted_status quoted_status user name',
    'retweeted_status quoted_status user profile_background_image_url_https',
    'retweeted_status quoted_status entities symbols',
    'retweeted_status quoted_status retweet_count',
    'retweeted_status quoted_status user id_str',
    'retweeted_status quoted_status retweeted',
    'retweeted_status quoted_status user created_at',
    'retweeted_status quoted_status place',
    'retweeted_status quoted_status user friends_count',
    'retweeted_status quoted_status user location',
    'retweeted_status quoted_status user listed_count',
    'retweeted_status quoted_status is_quote_status',
    'retweeted_status quoted_status in_reply_to_user_id_str',
    'retweeted_status quoted_status_id_str',
    'retweeted_status quoted_status user screen_name',
    'retweeted_status quoted_status user profile_sidebar_border_color',
    'retweeted_status quoted_status user default_profile_image',
    'retweeted_status quoted_status user utc_offset',
    'retweeted_status quoted_status favorited',
    'retweeted_status quoted_status user verified',
    'retweeted_status quoted_status user profile_background_tile',
    'retweeted_status quoted_status user translator_type',
    'retweeted_status quoted_status user profile_text_color',
    'retweeted_status quoted_status in_reply_to_screen_name',
    'retweeted_status quoted_status user notifications',
    'retweeted_status quoted_status user url',
    'retweeted_status quoted_status id_str',
    'retweeted_status quoted_status entities hashtags',
    'retweeted_status quoted_status favorite_count',
    'retweeted_status quoted_status geo',
    'retweeted_status quoted_status user lang',
    'retweeted_status quoted_status user geo_enabled',
    'retweeted_status quoted_status user profile_link_color',
    'retweeted_status quoted_status filter_level',
    'retweeted_status quoted_status user contributors_enabled',
    'retweeted_status quoted_status entities urls',
    'retweeted_status quoted_status user statuses_count',
    'retweeted_status quoted_status user time_zone',
    'quoted_status entities media',
    'retweeted_status quoted_status entities media',
    'retweeted_status quoted_status display_text_range',
    'retweeted_status quoted_status extended_entities media',
    'quoted_status extended_entities media',
    'retweeted_status quoted_status possibly_sensitive',
    'retweeted_status quoted_status place country',
    'retweeted_status quoted_status place url',
    'retweeted_status quoted_status place full_name',
    'retweeted_status quoted_status extended_tweet entities urls',
    'retweeted_status quoted_status extended_tweet entities symbols',
    'retweeted_status quoted_status extended_tweet entities hashtags',
    'retweeted_status quoted_status extended_tweet full_text',
    'retweeted_status quoted_status place id',
    'retweeted_status quoted_status place name',
    'retweeted_status quoted_status extended_tweet display_text_range',
    'retweeted_status quoted_status extended_tweet extended_entities media',
    'retweeted_status quoted_status extended_tweet entities media',
    'retweeted_status quoted_status extended_tweet entities user_mentions',
    'retweeted_status quoted_status place bounding_box coordinates',
    'retweeted_status quoted_status place bounding_box type',
    'retweeted_status quoted_status place place_type',
    'retweeted_status quoted_status place country_code',
    'coordinates type',
    'geo type',
    'coordinates coordinates',
    'geo coordinates',
    'retweeted_status scopes followers',
    'retweeted_status coordinates type',
    'retweeted_status geo type',
    'retweeted_status coordinates coordinates',
    'retweeted_status geo coordinates',
    'retweeted_status entities polls',
    'retweeted_status quoted_status quoted_status_id',
    'retweeted_status quoted_status quoted_status_id_str',
    'quoted_status entities polls',
    'retweeted_status quoted_status entities polls',
    'quoted_status coordinates coordinates',
    'quoted_status geo coordinates',
    'quoted_status geo type',
    'quoted_status coordinates type',
    'quoted_status user withheld_in_countries',
    'retweeted_status quoted_status user withheld_in_countries',
    'retweeted_status quoted_status withheld_in_countries',
    'quoted_status withheld_in_countries',
    'quoted_status scopes followers',
}

# Key paths expected in every original-format Tweet payload.
original_format_minimum_set_keys = {
    'contributors',
    'created_at',
    'entities hashtags',
    'entities symbols',
    'entities urls',
    'entities user_mentions',
    'favorite_count',
    'favorited',
    'filter_level',
    'id',
    'id_str',
    'in_reply_to_screen_name',
    'in_reply_to_status_id',
    'in_reply_to_status_id_str',
    'in_reply_to_user_id',
    'in_reply_to_user_id_str',
    'is_quote_status',
    'lang',
    'matching_rules',
    'quote_count',
    'reply_count',
    'retweet_count',
    'retweeted',
    'source',
    'text',
    'truncated',
    'user contributors_enabled',
    'user created_at',
    'user default_profile',
    'user default_profile_image',
    'user description',
    'user favourites_count',
    'user follow_request_sent',
    'user followers_count',
    'user following',
    'user friends_count',
    'user geo_enabled',
    'user id',
    'user id_str',
    'user is_translator',
    'user lang',
    'user listed_count',
    'user location',
    'user name',
    'user notifications',
    'user profile_background_color',
    'user profile_background_image_url',
    'user profile_background_image_url_https',
    'user profile_background_tile',
    'user profile_image_url',
    'user profile_image_url_https',
    'user profile_link_color',
    'user profile_sidebar_border_color',
    'user profile_sidebar_fill_color',
    'user profile_text_color',
    'user profile_use_background_image',
    'user protected',
    'user screen_name',
    'user statuses_count',
    'user time_zone',
    'user translator_type',
    'user url',
    'user utc_offset',
    'user verified',
}


# Every key path that may appear in an activity-streams Tweet payload.
activity_streams_superset_keys = {
    'actor displayName',
    'actor favoritesCount',
    'actor followersCount',
    'actor friendsCount',
    'actor id',
    'actor image',
    'actor languages',
    'actor link',
    'actor links',
    'actor listedCount',
    'actor objectType',
    'actor postedTime',
    'actor preferredUsername',
    'actor statusesCount',
    'actor summary',
    'actor twitterTimeZone',
    'actor utcOffset',
    'actor verified',
    'body',
    'display_text_range',
    'favoritesCount',
    'generator displayName',
    'generator link',
    'gnip klout_profile klout_user_id',
    'gnip klout_profile link',
    'gnip klout_profile topics',
    'gnip klout_score',
    'gnip matching_rules',
    'gnip urls',
    'id',
    'inReplyTo link',
    'link',
    'location country_code',
    'location displayName',
    'location geo coordinates',
    'location geo type',
    'location link',
    'location name',
    'location objectType',
    'location twitter_country_code',
    'location twitter_place_type',
    'long_object body',
    'long_object display_text_range',
    'long_object twitter_entities hashtags',
    'long_object twitter_entities media',
    'long_object twitter_entities symbols',
    'long_object twitter_entities urls',
    'long_object twitter_entities user_mentions',
    'long_object twitter_extended_entities media',
    'object actor displayName',
    'object actor favoritesCount',
    'object actor followersCount',
    'object actor friendsCount',
    'object actor id',
    'object actor image',
    'object actor languages',
    'object actor link',
    'object actor links',
    'object actor listedCount',
    'object actor location displayName',
    'object actor location objectType',
    'object actor objectType',
    'object actor postedTime',
    'object actor preferredUsername',
    'object actor statusesCount',
    'object actor summary',
    'object actor twitterTimeZone',
    'object actor utcOffset',
    'object actor verified',
    'object body',
    'object display_text_range',
    'object favoritesCount',
    'object generator displayName',
    'object generator link',
    'object id',
    'object inReplyTo link',
    'object link',
    'object location country_code',
    'object location displayName',
    'object location geo coordinates',
    'object location geo type',
    'object location link',
    'object location name',
    'object location objectType',
    'object location twitter_country_code',
    'object location twitter_place_type',
    'object long_object body',
    'object long_object display_text_range',
    'object long_object twitter_entities hashtags',
    'object long_object twitter_entities media',
    'object long_object twitter_entities symbols',
    'object long_object twitter_entities urls',
    'object long_object twitter_entities user_mentions',
    'object long_object twitter_extended_entities media',
    'object object id',
    'object object link',
    'object object objectType',
    'object object postedTime',
    'object object summary',
    'object objectType',
    'object postedTime',
    'object provider displayName',
    'object provider link',
    'object provider objectType',
    'object summary',
    'object twitter_entities hashtags',
    'object twitter_entities symbols',
    'object twitter_entities urls',
    'object twitter_entities user_mentions',
    'object twitter_filter_level',
    'object twitter_lang',
    'object verb',
    'objectType',
    'postedTime',
    'provider displayName',
    'provider link',
    'provider objectType',
    'retweetCount',
    'twitter_entities hashtags',
    'twitter_entities media',
    'twitter_entities symbols',
    'twitter_entities urls',
    'twitter_entities user_mentions',
    'twitter_extended_entities media',
    'twitter_filter_level',
    'twitter_lang',
    'twitter_quoted_status actor displayName',
    'twitter_quoted_status actor favoritesCount',
    'twitter_quoted_status actor followersCount',
    'twitter_quoted_status actor friendsCount',
    'twitter_quoted_status actor id',
    'twitter_quoted_status actor image',
    'twitter_quoted_status actor languages',
    'twitter_quoted_status actor link',
    'twitter_quoted_status actor links',
    'twitter_quoted_status actor listedCount',
    'twitter_quoted_status actor location displayName',
    'twitter_quoted_status actor location objectType',
    'twitter_quoted_status actor objectType',
    'twitter_quoted_status actor postedTime',
    'twitter_quoted_status actor preferredUsername',
    'twitter_quoted_status actor statusesCount',
    'twitter_quoted_status actor summary',
    'twitter_quoted_status actor twitterTimeZone',
    'twitter_quoted_status actor utcOffset',
    'twitter_quoted_status actor verified',
    'twitter_quoted_status body',
    'twitter_quoted_status display_text_range',
    'twitter_quoted_status favoritesCount',
    'twitter_quoted_status generator displayName',
    'twitter_quoted_status generator link',
    'twitter_quoted_status id',
    'twitter_quoted_status inReplyTo link',
    'twitter_quoted_status link',
    'twitter_quoted_status location country_code',
    'twitter_quoted_status location displayName',
    'twitter_quoted_status location geo coordinates',
    'twitter_quoted_status location geo type',
    'twitter_quoted_status location link',
    'twitter_quoted_status location name',
    'twitter_quoted_status location objectType',
    'twitter_quoted_status location twitter_country_code',
    'twitter_quoted_status location twitter_place_type',
    'twitter_quoted_status long_object body',
    'twitter_quoted_status long_object display_text_range',
    'twitter_quoted_status long_object twitter_entities hashtags',
    'twitter_quoted_status long_object twitter_entities media',
    'twitter_quoted_status long_object twitter_entities symbols',
    'twitter_quoted_status long_object twitter_entities urls',
    'twitter_quoted_status long_object twitter_entities user_mentions',
    'twitter_quoted_status long_object twitter_extended_entities media',
    'twitter_quoted_status object id',
    'twitter_quoted_status object link',
    'twitter_quoted_status object objectType',
    'twitter_quoted_status object postedTime',
    'twitter_quoted_status object summary',
    'twitter_quoted_status objectType',
    'twitter_quoted_status postedTime',
    'twitter_quoted_status provider displayName',
    'twitter_quoted_status provider link',
    'twitter_quoted_status provider objectType',
    'twitter_quoted_status twitter_entities hashtags',
    'twitter_quoted_status twitter_entities symbols',
    'twitter_quoted_status twitter_entities urls',
    'twitter_quoted_status twitter_entities user_mentions',
    'twitter_quoted_status twitter_filter_level',
    'twitter_quoted_status twitter_lang',
    'twitter_quoted_status verb',
    'verb',
    # --- Keys appended after the alphabetised list above (original order kept).
    'object twitter_entities media',
    'object twitter_extended_entities media',
    'actor location displayName',
    'gnip profileLocations',
    'actor location objectType',
    'geo coordinates',
    'geo type',
    'twitter_quoted_status twitter_entities media',
    'twitter_quoted_status twitter_extended_entities media',
    'object geo coordinates',
    'object geo type',
}

# Key paths expected in every activity-streams Tweet payload.
activity_streams_minimum_set_keys = {
    'actor displayName',
    'actor favoritesCount',
    'actor followersCount',
    'actor friendsCount',
    'actor id',
    'actor image',
    'actor languages',
    'actor link',
    'actor links',
    'actor listedCount',
    'actor objectType',
    'actor postedTime',
    'actor preferredUsername',
    'actor statusesCount',
    'actor summary',
    'actor twitterTimeZone',
    'actor utcOffset',
    'actor verified',
    'body',
    'favoritesCount',
    'generator displayName',
    'generator link',
    'gnip matching_rules',
    'id',
    'link',
    'object id',
    'object link',
    'object objectType',
    'object postedTime',
    'objectType',
    'postedTime',
    'provider displayName',
    'provider link',
    'provider objectType',
    'retweetCount',
    'twitter_entities hashtags',
    'twitter_entities symbols',
    'twitter_entities urls',
    'twitter_entities user_mentions',
    'twitter_filter_level',
    'twitter_lang',
    'verb',
}
# -*- coding: utf-8 -*-
# Copyright 2018 Twitter, Inc.
# Licensed under the MIT License
# https://opensource.org/licenses/MIT
class NotATweetError(Exception):
    """Raised when a payload cannot be treated as a Tweet.

    The tweet checks raise this when a required key (for example ``'id'``)
    is absent from the payload.
    """


class NotAvailableError(Exception):
    """Error type for data that is not available.

    NOTE(review): the raising sites live outside this module; confirm the
    exact conditions there.
    """


class UnexpectedFormatError(Exception):
    """Error type for payloads in an unexpected format.

    NOTE(review): the raising sites live outside this module; confirm the
    exact conditions there.
    """