├── .gitignore
├── .nojekyll
├── LICENSE
├── README.rst
├── _config.yml
├── doc_build.sh
├── docs
    ├── Makefile
    ├── readme.md
    └── source
    │   ├── README.rst
    │   ├── conf.py
    │   ├── index.rst
    │   ├── tweet_parser.getter_methods.rst
    │   └── tweet_parser.rst
├── setup.py
├── test
    ├── test_tweet_parser.py
    └── tweet_payload_examples
    │   ├── activity_streams_examples.json
    │   ├── broken_and_unsupported_payloads
    │       ├── activity_streams_additional_field.json
    │       ├── activity_streams_missing_field.json
    │       ├── original_format_additional_field.json
    │       ├── original_format_missing_field.json
    │       ├── original_format_missing_quotetweet_field.json
    │       ├── original_format_missing_user.json
    │       └── public_api_sample.json
    │   └── original_format_examples.json
├── tools
    ├── demo_notebook.ipynb
    └── parse_tweets.py
└── tweet_parser
    ├── __init__.py
    ├── deprecator.py
    ├── getter_methods
        ├── __init__.py
        ├── gnip_fields.py
        ├── tweet_counts.py
        ├── tweet_date.py
        ├── tweet_embeds.py
        ├── tweet_entities.py
        ├── tweet_generator.py
        ├── tweet_geo.py
        ├── tweet_links.py
        ├── tweet_reply.py
        ├── tweet_text.py
        └── tweet_user.py
    ├── lazy_property.py
    ├── tweet.py
    ├── tweet_checking.py
    ├── tweet_keys.py
    └── tweet_parser_errors.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | env/
 12 | build/
 13 | develop-eggs/
 14 | dist/
 15 | downloads/
 16 | eggs/
 17 | .eggs/
 18 | lib/
 19 | lib64/
 20 | parts/
 21 | sdist/
 22 | var/
 23 | wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | 
 49 | # Translations
 50 | *.mo
 51 | *.pot
 52 | 
 53 | # Django stuff:
 54 | *.log
 55 | local_settings.py
 56 | 
 57 | # Flask stuff:
 58 | instance/
 59 | .webassets-cache
 60 | 
 61 | # Scrapy stuff:
 62 | .scrapy
 63 | 
 64 | # Sphinx documentation
 65 | docs/_build/
 66 | 
 67 | # PyBuilder
 68 | target/
 69 | 
 70 | # Jupyter Notebook
 71 | .ipynb_checkpoints
 72 | 
 73 | # pyenv
 74 | .python-version
 75 | 
 76 | # celery beat schedule file
 77 | celerybeat-schedule
 78 | 
 79 | # SageMath parsed files
 80 | *.sage.py
 81 | 
 82 | # dotenv
 83 | .env
 84 | 
 85 | # virtualenv
 86 | .venv
 87 | venv/
 88 | ENV/
 89 | 
 90 | # Spyder project settings
 91 | .spyderproject
 92 | .spyproject
 93 | 
 94 | # Rope project settings
 95 | .ropeproject
 96 | 
 97 | # mkdocs documentation
 98 | /site
 99 | 
100 | # mypy
101 | .mypy_cache/
102 | 
103 | # macosx
104 | .DS_Store
105 | 
106 | # vim
107 | *.swp
108 | *.swo
109 | *~
110 | 


--------------------------------------------------------------------------------
/.nojekyll:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xdevplatform/tweet_parser/3435de8367d36b483a6cfd8d46cc28694ee8a42e/.nojekyll


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 Twitter, Inc
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
  1 | Tweet Parser
  2 | ============
  3 | 
  4 | Authors: `Fiona Pigott <https://github.com/fionapigott>`__, `Jeff
  5 | Kolb <https://github.com/jeffakolb>`__, `Josh
  6 | Montague <https://github.com/jrmontag>`__, `Aaron
  7 | Gonzales <https://github.com/binaryaaron>`__
  8 | 
  9 | Goal:
 10 | -----
 11 | 
 12 | Allow reliable parsing of Tweets delivered by the Gnip platform, in both
 13 | activity-streams and original formats.
 14 | 
 15 | Status:
 16 | -------
 17 | 
 18 | This package can be installed by cloning the repo and using
 19 | ``pip install -e .``, or by using ``pip install tweet_parser``.
 20 | 
 21 | As of version 1.0.5, the package works with Python 2 and 3, and the 
 22 | API should be relatively stable. Using the most recent release is recommended.
 23 | The current release is 1.13.2.
 24 | 
 25 | Currently, this parser does not explicitly support Public API Twitter
 26 | data.
 27 | 
 28 | Usage:
 29 | ------
 30 | 
 31 | This package is intended to be used as a Python module inside your other
 32 | Tweet-related code. An example Python program (after pip installing the
 33 | package) would be:
 34 | 
 35 | .. code:: python
 36 | 
 37 |     from tweet_parser.tweet import Tweet
 38 |     from tweet_parser.tweet_parser_errors import NotATweetError
 39 |     import fileinput
 40 |     import json
 41 | 
 42 |     for line in fileinput.FileInput("gnip_tweet_data.json"):
 43 |         try:
 44 |             tweet_dict = json.loads(line)
 45 |             tweet = Tweet(tweet_dict)
 46 |         except (json.JSONDecodeError, NotATweetError):
 47 |             continue  # skip lines that are not valid Tweets
 48 |         print(tweet.created_at_string, tweet.all_text)
 49 | 
 50 | A simple command-line utility is also included:
 51 | 
 52 | .. code:: bash
 53 | 
 54 |     python tools/parse_tweets.py -f"gnip_tweet_data.json" -c"created_at_string,all_text"
 55 | 
 56 | Testing:
 57 | --------
 58 | 
 59 | A Python test script, ``test_tweet_parser.py``, exists in ``test/``.
 60 | 
 61 | The most important thing that it tests is the equivalence of outputs
 62 | when comparing both activity-streams input and original-format input.
 63 | Any new getter will be tested by running
 64 | ``test$ python test_tweet_parser.py``, as the test checks every method
 65 | attached to the Tweet object, for every test tweet stored in
 66 | ``test/tweet_payload_examples``. For any cases where it is expected that
 67 | the outputs are different (e.g., outputs that depend on poll options),
 68 | conditional statements should be added to this test.
 69 | 
 70 | An option also exists for run-time checking of Tweet payload formats.
 71 | This compares the set of all Tweet field keys to a superset of all
 72 | possible keys, as well as a minimum set of all required keys, to make
 73 | sure that each newly loaded Tweet fits those parameters. This shouldn't
 74 | be run every time you load Tweets (for one, it's slow), but is
 75 | implemented to use as a periodic check against Tweet format changes.
 76 | This option is enabled with ``--do_format_validation`` on the command
 77 | line, and by setting the keyword argument ``do_format_validation`` to
 78 | ``True`` when initializing a ``Tweet`` object.
 79 | 
 80 | Contributing
 81 | ------------
 82 | 
 83 | Submit bug reports or feature requests through GitHub Issues, with
 84 | self-contained minimum working examples where appropriate.
 85 | 
 86 | To contribute code, fork this repo, create your own local feature
 87 | branch, make your changes, test them, and submit a pull request to the
 88 | master branch. The contribution guidelines specified in the ``pandas``
 89 | `documentation <http://pandas.pydata.org/pandas-docs/stable/contributing.html#working-with-the-code>`__
 90 | are a great reference.
 91 | 
 92 | When you submit a change, change the version number. For bug fixes and
 93 | non-breaking changes that do not affect the top-level Tweet object API 
 94 | (fixing a bug or changing the internals of a getter while package naming/structure
 95 | remains the same), increment the last number (X.Y.Z -> X.Y.Z+1) in
 96 | ``setup.py``. For changes that do affect the top-level Tweet object API (e.g., adding a 
 97 | new getter), increment the middle number (X.Y.Z -> X.Y+1.0).
 98 | 
 99 | Guidelines for new getters
100 | ~~~~~~~~~~~~~~~~~~~~~~~~~~
101 | 
102 | A *getter* is a method in the Tweet class and the accompanying code in
103 | the ``getter_methods`` module. A getter for some property should:
104 | 
105 | - be named ``<property>``, a method in ``Tweet`` decorated with
106 |   ``@lazy_property``
107 | - have a corresponding method named
108 |   ``get_<property>(tweet)`` in the ``getter_methods`` module that
109 |   implements the logic, nested under the appropriate submodule (a text
110 |   property probably lives under the ``getter_methods.tweet_text``
111 |   submodule) 
112 | - provide the exact same output for original format and
113 |   activity-streams format Tweet input, except in the case where certain
114 |   information is unavailable (see ``get_poll_options``).
115 | 
116 | In general, prefer that the ``get_<property>`` work on a simple Tweet
117 | dictionary as well as a Tweet object (this makes unit testing easier).
118 | This means that you might use ``is_original_format(tweet)`` rather than
119 | ``tweet.is_original_format`` to check format inside of a getter.
120 | 
121 | Adding unit tests for your getter in the docstrings in the "Example"
122 | section is helpful. See existing getters for examples.
123 | 
124 | In general, make detailed docstrings with examples in
125 | ``get_<property>``, and more concise docstrings in ``Tweet``, with a
126 | reference for where to find the ``get_<property>`` getter that
127 | implements the logic.
128 | 
129 | Style
130 | ~~~~~
131 | 
132 | Adhere to the PEP8 style. Using a Python linter (like flake8) is
133 | recommended.
134 | 
135 | For documentation style, use `Google-style
136 | docstrings <http://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html>`__.
137 | Refer to the `Python doctest
138 | documentation <https://docs.python.org/3/library/doctest.html>`__ for
139 | doctest guidelines.
140 | 
141 | Testing
142 | ~~~~~~~
143 | 
144 | Create an isolated virtual environment for testing (there are currently
145 | no external dependencies for this library).
146 | 
147 | Test your new feature by reinstalling the library in your virtual
148 | environment and running the test script as shown below. Fix any issues
149 | until all tests pass.
150 | 
151 | .. code-block:: bash
152 | 
153 |     (env) [tweet_parser]$ pip install -e .
154 |     (env) [tweet_parser]$ cd test/; python test_tweet_parser.py; cd -
155 | 
156 | Furthermore, if contributing a new accessor or getter method for payload
157 | elements, verify the code works as you intended by running the
158 | ``parse_tweets.py`` script with your new field, as shown below. Check
159 | that both input types produce the intended output.
160 | 
161 | Note that FieldDeprecationWarnings will appear while testing certain getters; this is expected behavior.
162 | 
163 | .. code-block:: bash
164 | 
165 |     (env) [tweet_parser]$ pip install -e .
166 |     (env) [tweet_parser]$ python tools/parse_tweets.py -f test/tweet_payload_examples/activity_streams_examples.json -c <your new field>
167 | 
168 | And lastly, if you've added new docstrings and doctests, from the
169 | ``docs`` directory, run ``make html`` (to check docstring formatting)
170 | and ``make doctest`` to run the doctests.
171 | 


--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-cayman


--------------------------------------------------------------------------------
/doc_build.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Build the Sphinx docs from a source branch and stage them on gh-pages.
 4 | #
 5 | # Usage: bash doc_build.sh <BRANCH_NAME>
 6 | #
 7 | # Checks out gh-pages, removes the previous build output, pulls docs/,
 8 | # tweet_parser/ and README.rst from <BRANCH_NAME>, runs "make html", moves the
 9 | # generated site to the repository root and prints the git add/commit/push
10 | # commands to run after review (those final commands are only printed, not run).
11 | 
12 | if [ $# -ne 1 ]; then
13 |   echo >&2 "Error: Please provide a branch name from which documentation will be built"
14 |   exit 1
15 | fi
16 | 
17 | BRANCH_NAME=$1
18 | 
19 | echo "Building documentation from $BRANCH_NAME"
20 | echo "checking out gh-pages"
21 | if ! git checkout gh-pages
22 | then
23 |   echo >&2 "checkout of gh-pages branch failed; please ensure you have local changes committed prior to running this script "
24 |   echo "exiting"
25 |   exit 1
26 | fi
27 | 
28 | pwd
29 | echo "removing current files"
30 | rm -rf *.egg-info
31 | 
32 | git pull origin gh-pages
33 | # clean old doc build; -f so that output missing from a first build is not an error
34 | rm -rf *.html *.js _sources/ _static/
35 | 
36 | # ensure .nojekyll is here
37 | touch .nojekyll
38 | 
39 | # grab the correct stuff from the build branch; stop on failure so the mv/rm
40 | # steps below never run without the fresh sources in place
41 | if ! git checkout "$BRANCH_NAME" docs tweet_parser README.rst
42 | then
43 |   echo >&2 "could not check out docs, tweet_parser and README.rst from $BRANCH_NAME"
44 |   exit 1
45 | fi
46 | 
47 | mv docs/* .
48 | cp README.rst source/README.rst
49 | if ! make html
50 | then
51 |   echo >&2 "make html failed; not moving build output into place"
52 |   exit 1
53 | fi
54 | mv -fv build/html/* ./
55 | rm -rf tweet_parser docs build Makefile source __pycache__/
56 | echo "--------------------------------------------------------------------"
57 | echo " docs built; please review these changes and then run the following:"
58 | echo "--------------------------------------------------------------------"
59 | echo git add -A
60 | # cite the tip of the branch the docs were built from (not necessarily master)
61 | echo git commit -m \"Generated gh-pages for $(git log "$BRANCH_NAME" -1 --pretty=short --abbrev-commit | grep commit)\"
62 | echo git push origin gh-pages
63 | echo git checkout "$BRANCH_NAME"
64 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line.
 5 | SPHINXOPTS    =
 6 | SPHINXBUILD   = sphinx-build
 7 | SPHINXPROJ    = TweetParser
 8 | SOURCEDIR     = source
 9 | BUILDDIR      = build
10 | GH_PAGES_SOURCES = docs tweet_parser
11 | 
12 | # Put it first so that "make" without argument is like "make help".
13 | help:
14 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
15 | 
16 | .PHONY: help Makefile
17 | 
18 | # Catch-all target: route all unknown targets to Sphinx using the new
19 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
20 | %: Makefile
21 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
22 | 
23 | 


--------------------------------------------------------------------------------
/docs/readme.md:
--------------------------------------------------------------------------------
 1 | ## Documentation
 2 | We are using Sphinx with Google-style docstrings to build our documentation. It's a fairly straightforward process to build the docs locally to preview your changes. There is a script for deployment to gh-pages, described below.
 3 | 
 4 | ### Setup
 5 | 
 6 | Sphinx is required, and [sphinx_bootstrap](https://github.com/ryan-roemer/sphinx-bootstrap-theme) is also required for building the docs with the Bootstrap theme.
 7 | 
 8 | on osx:
 9 | 
10 | ```.bash
11 | pip install sphinx
12 | pip install recommonmark
13 | pip install sphinx-bootstrap-theme
14 | pip install sphinxcontrib-napoleon
15 | ```
16 | 
17 | ### Build
18 | 
19 | This will build the docs locally for testing and future deployment.
20 | 
21 | ```.bash
22 | cd tweet_parser/docs
23 | make clean
24 | make html
25 | ```
26 | 
27 | ### Deploying to github pages
28 | From the root of the repo run:
29 | 
30 | ```.bash
31 | bash doc_build.sh <BRANCH_NAME>
32 | ```
33 | 
34 | where `<BRANCH_NAME>` is the name of the branch you'll be building from, most likely master. The script will change to the `gh-pages` branch, clean out the old docs, pull your changes from the relevant branch, build them, and give you instructions for review and commands for deployment.
35 | 


--------------------------------------------------------------------------------
/docs/source/README.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../../README.rst
2 | 


--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | #
  4 | # Tweet Parser documentation build configuration file, created by
  5 | # sphinx-quickstart on Wed Aug  9 13:44:53 2017.
  6 | #
  7 | # This file is execfile()d with the current directory set to its
  8 | # containing dir.
  9 | #
 10 | # Note that not all possible configuration values are present in this
 11 | # autogenerated file.
 12 | #
 13 | # All configuration values have a default; values that are commented out
 14 | # serve to show the default.
 15 | 
 16 | # If extensions (or modules to document with autodoc) are in another directory,
 17 | # add these directories to sys.path here. If the directory is relative to the
 18 | # documentation root, use os.path.abspath to make it absolute, like shown here.
 19 | #
 20 | import os
 21 | import sys
 22 | 
 23 | sys.path.insert(0, os.path.abspath('../'))
 24 | sys.path.insert(0, os.path.abspath('.'))
 25 | 
 26 | 
 27 | import sphinx_bootstrap_theme
 28 | 
 29 | # -- General configuration ------------------------------------------------
 30 | 
 31 | # If your documentation needs a minimal Sphinx version, state it here.
 32 | #
 33 | # needs_sphinx = '1.0'
 34 | 
 35 | # Add any Sphinx extension module names here, as strings. They can be
 36 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 37 | # ones.
 38 | extensions = ['sphinx.ext.autodoc',
 39 |     'sphinx.ext.doctest',
 40 |     'sphinx.ext.intersphinx',
 41 |     'sphinx.ext.todo',
 42 |     'sphinx.ext.coverage',
 43 |     'sphinx.ext.viewcode',
 44 |     'sphinxcontrib.napoleon',
 45 |     'sphinx.ext.githubpages']
 46 | 
 47 | 
 48 | # Add any paths that contain templates here, relative to this directory.
 49 | templates_path = ['_templates']
 50 | 
 51 | # The suffix(es) of source filenames.
 52 | # You can specify multiple suffix as a list of string:
 53 | 
 54 | source_parsers = {
 55 |    '.md': 'recommonmark.parser.CommonMarkParser',
 56 | }
 57 | source_suffix = ['.rst', '.md']
 58 | 
 59 | 
 60 | # The master toctree document.
 61 | master_doc = 'index'
 62 | 
 63 | # General information about the project.
 64 | project = 'Tweet Parser'
 65 | copyright = '2017, Twitter DDIS'
 66 | author = 'Twitter DDIS'
 67 | 
 68 | # The version info for the project you're documenting, acts as replacement for
 69 | # |version| and |release|, also used in various other places throughout the
 70 | # built documents.
 71 | #
 72 | # The short X.Y version.
 73 | version = '1'
 74 | # The full version, including alpha/beta/rc tags.
 75 | release = '1'
 76 | 
 77 | # The language for content autogenerated by Sphinx. Refer to documentation
 78 | # for a list of supported languages.
 79 | #
 80 | # This is also used if you do content translation via gettext catalogs.
 81 | # Usually you set "language" from the command line for these cases.
 82 | language = None
 83 | 
 84 | # List of patterns, relative to source directory, that match files and
 85 | # directories to ignore when looking for source files.
 86 | # These patterns also affect html_static_path and html_extra_path
 87 | exclude_patterns = []
 88 | 
 89 | # The name of the Pygments (syntax highlighting) style to use.
 90 | pygments_style = 'sphinx'
 91 | 
 92 | # If true, `todo` and `todoList` produce output, else they produce nothing.
 93 | todo_include_todos = False
 94 | 
 95 | 
 96 | # -- Options for HTML output ----------------------------------------------
 97 | 
 98 | # The theme to use for HTML and HTML Help pages.  See the documentation for
 99 | # a list of builtin themes.
100 | #
101 | # Theme options are theme-specific and customize the look and feel of a theme
102 | # further.  For a list of options available for each theme, see the
103 | # documentation.
104 | #
105 | # html_theme_options = {}
106 | 
107 | # Add any paths that contain custom static files (such as style sheets) here,
108 | # relative to this directory. They are copied after the builtin static files,
109 | # so a file named "default.css" will overwrite the builtin "default.css".
110 | html_static_path = []
111 | 
112 | 
113 | # html_theme = 'alabaster'
114 | # html_theme_path = sphinx_bootstrap_theme.get_html_theme_path()
115 | html_theme = 'bootstrap'
116 | html_theme_path = sphinx_bootstrap_theme.get_html_theme_path()
117 | 
118 | html_theme_options = {
119 |     # Navigation bar title. (Default: ``project`` value)
120 |     'navbar_title': "Tweet Parser",
121 | 
122 |     # Tab name for entire site. (Default: "Site")
123 |     'navbar_site_name': "Tweet Parser",
124 | 
125 |     # A list of tuples containing pages or urls to link to.
126 |     # Valid tuples should be in the following forms:
127 |     #    (name, page)                 # a link to a page
128 |     #    (name, "/aa/bb", 1)          # a link to an arbitrary relative url
129 |     #    (name, "http://example.com", True) # arbitrary absolute url
130 |     # Note the "1" or "True" value above as the third argument to indicate
131 |     # an arbitrary url.
132 |     'navbar_links': [
133 |         ("Github", "https://github.com/twitterdev/tweet_parser", True),
134 |     ],
135 | 
136 |     # Render the next and previous page links in navbar. (Default: true)
137 |     'navbar_sidebarrel': True,
138 | 
139 |     # Render the current pages TOC in the navbar. (Default: true)
140 |     'navbar_pagenav': True,
141 | 
142 |     # Tab name for the current pages TOC. (Default: "Page")
143 |     'navbar_pagenav_name': "Page",
144 | 
145 |     # Global TOC depth for "site" navbar tab. (Default: 1)
146 |     # Switching to -1 shows all levels.
147 |     'globaltoc_depth': 2,
148 | 
149 |     # Include hidden TOCs in Site navbar?
150 |     #
151 |     # Note: If this is "false", you cannot have mixed ``:hidden:`` and
152 |     # non-hidden ``toctree`` directives in the same page, or else the build
153 |     # will break.
154 |     #
155 |     # Values: "true" (default) or "false"
156 |     'globaltoc_includehidden': "true",
157 | 
158 |     # HTML navbar class (Default: "navbar") to attach to <div> element.
159 |     # For black navbar, do "navbar navbar-inverse"
160 |     'navbar_class': "navbar navbar-inverse",
161 | 
162 |     # Fix navigation bar to top of page?
163 |     # Values: "true" (default) or "false"
164 |     'navbar_fixed_top': "true",
165 | 
166 |     # Location of link to source.
167 |     # Options are "nav" (default), "footer" or anything else to exclude.
168 |     'source_link_position': None,
169 | 
170 |     # Bootswatch (http://bootswatch.com/) theme.
171 |     #
172 |     # Options are nothing (default) or the name of a valid theme
173 |     # such as "cosmo" or "sandstone".
174 |     'bootswatch_theme': "cosmo",
175 | 
176 |     # Choose Bootstrap version.
177 |     # Values: "3" (default) or "2" (in quotes)
178 |     'bootstrap_version': "3",
179 | }
180 | 
181 | # -- Options for HTMLHelp output ------------------------------------------
182 | 
183 | # Output file base name for HTML help builder.
184 | htmlhelp_basename = 'TweetParserdoc'
185 | 
186 | 
187 | # -- Options for LaTeX output ---------------------------------------------
188 | 
189 | latex_elements = {
190 |     # The paper size ('letterpaper' or 'a4paper').
191 |     #
192 |     # 'papersize': 'letterpaper',
193 | 
194 |     # The font size ('10pt', '11pt' or '12pt').
195 |     #
196 |     # 'pointsize': '11pt',
197 | 
198 |     # Additional stuff for the LaTeX preamble.
199 |     #
200 |     # 'preamble': '',
201 | 
202 |     # Latex figure (float) alignment
203 |     #
204 |     # 'figure_align': 'htbp',
205 | }
206 | 
207 | # Grouping the document tree into LaTeX files. List of tuples
208 | # (source start file, target name, title,
209 | #  author, documentclass [howto, manual, or own class]).
210 | latex_documents = [
211 |     (master_doc, 'TweetParser.tex', 'Tweet Parser Documentation',
212 |      'Twitter DDIS', 'manual'),
213 | ]
214 | 
215 | 
216 | # -- Options for manual page output ---------------------------------------
217 | 
218 | # One entry per manual page. List of tuples
219 | # (source start file, name, description, authors, manual section).
220 | man_pages = [
221 |     (master_doc, 'tweetparser', 'Tweet Parser Documentation',
222 |      [author], 1)
223 | ]
224 | 
225 | 
226 | # -- Options for Texinfo output -------------------------------------------
227 | 
228 | # Grouping the document tree into Texinfo files. List of tuples
229 | # (source start file, target name, title, author,
230 | #  dir menu entry, description, category)
231 | texinfo_documents = [
232 |     (master_doc, 'TweetParser', 'Tweet Parser Documentation',
233 |      author, 'TweetParser', 'One line description of project.',
234 |      'Miscellaneous'),
235 | ]
236 | 


--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
 1 | .. Tweet Parser documentation master file, created by
 2 |    sphinx-quickstart on Wed Aug  9 13:44:53 2017.
 3 |    You can adapt this file completely to your liking, but it should at least
 4 |    contain the root `toctree` directive.
 5 | .. include:: README.rst
 6 | 
 7 | 
 8 | .. toctree::
 9 |    :maxdepth: 3
10 |    :caption: Contents:
11 | 
12 |    self
13 |    tweet_parser
14 | 
15 | 
16 | 
17 | Indices and tables
18 | ==================
19 | 
20 | * :ref:`genindex`
21 | * :ref:`modindex`
22 | * :ref:`search`
23 | 
24 | 
25 | 


--------------------------------------------------------------------------------
/docs/source/tweet_parser.getter_methods.rst:
--------------------------------------------------------------------------------
  1 | tweet\_parser\.getter\_methods package
  2 | ======================================
  3 | 
  4 | Submodules
  5 | ----------
  6 | 
  7 | tweet\_parser\.getter\_methods\.gnip\_fields module
  8 | ---------------------------------------------------
  9 | 
 10 | .. automodule:: tweet_parser.getter_methods.gnip_fields
 11 |     :members:
 12 |     :undoc-members:
 13 |     :show-inheritance:
 14 | 
 15 | tweet\_parser\.getter\_methods\.tweet\_counts module
 16 | ----------------------------------------------------
 17 | 
 18 | .. automodule:: tweet_parser.getter_methods.tweet_counts
 19 |     :members:
 20 |     :undoc-members:
 21 |     :show-inheritance:
 22 | 
 23 | tweet\_parser\.getter\_methods\.tweet\_date module
 24 | --------------------------------------------------
 25 | 
 26 | .. automodule:: tweet_parser.getter_methods.tweet_date
 27 |     :members:
 28 |     :undoc-members:
 29 |     :show-inheritance:
 30 | 
 31 | tweet\_parser\.getter\_methods\.tweet\_embeds module
 32 | ----------------------------------------------------
 33 | 
 34 | .. automodule:: tweet_parser.getter_methods.tweet_embeds
 35 |     :members:
 36 |     :undoc-members:
 37 |     :show-inheritance:
 38 | 
 39 | tweet\_parser\.getter\_methods\.tweet\_entities module
 40 | ------------------------------------------------------
 41 | 
 42 | .. automodule:: tweet_parser.getter_methods.tweet_entities
 43 |     :members:
 44 |     :undoc-members:
 45 |     :show-inheritance:
 46 | 
 47 | tweet\_parser\.getter\_methods\.tweet\_generator module
 48 | -------------------------------------------------------
 49 | 
 50 | .. automodule:: tweet_parser.getter_methods.tweet_generator
 51 |     :members:
 52 |     :undoc-members:
 53 |     :show-inheritance:
 54 | 
 55 | tweet\_parser\.getter\_methods\.tweet\_geo module
 56 | -------------------------------------------------
 57 | 
 58 | .. automodule:: tweet_parser.getter_methods.tweet_geo
 59 |     :members:
 60 |     :undoc-members:
 61 |     :show-inheritance:
 62 | 
 63 | tweet\_parser\.getter\_methods\.tweet\_links module
 64 | ---------------------------------------------------
 65 | 
 66 | .. automodule:: tweet_parser.getter_methods.tweet_links
 67 |     :members:
 68 |     :undoc-members:
 69 |     :show-inheritance:
 70 | 
 71 | tweet\_parser\.getter\_methods\.tweet\_reply module
 72 | ---------------------------------------------------
 73 | 
 74 | .. automodule:: tweet_parser.getter_methods.tweet_reply
 75 |     :members:
 76 |     :undoc-members:
 77 |     :show-inheritance:
 78 | 
 79 | tweet\_parser\.getter\_methods\.tweet\_text module
 80 | --------------------------------------------------
 81 | 
 82 | .. automodule:: tweet_parser.getter_methods.tweet_text
 83 |     :members:
 84 |     :undoc-members:
 85 |     :show-inheritance:
 86 | 
 87 | tweet\_parser\.getter\_methods\.tweet\_user module
 88 | --------------------------------------------------
 89 | 
 90 | .. automodule:: tweet_parser.getter_methods.tweet_user
 91 |     :members:
 92 |     :undoc-members:
 93 |     :show-inheritance:
 94 | 
 95 | 
 96 | Module contents
 97 | ---------------
 98 | 
 99 | .. automodule:: tweet_parser.getter_methods
100 |     :members:
101 |     :undoc-members:
102 |     :show-inheritance:
103 | 


--------------------------------------------------------------------------------
/docs/source/tweet_parser.rst:
--------------------------------------------------------------------------------
 1 | tweet\_parser package
 2 | =====================
 3 | 
 4 | Subpackages
 5 | -----------
 6 | 
 7 | .. toctree::
 8 | 
 9 |     tweet_parser.getter_methods
10 | 
11 | Submodules
12 | ----------
13 | 
14 | tweet\_parser\.lazy\_property module
15 | ------------------------------------
16 | 
17 | .. automodule:: tweet_parser.lazy_property
18 |     :members:
19 |     :undoc-members:
20 |     :show-inheritance:
21 | 
22 | tweet\_parser\.tweet module
23 | ---------------------------
24 | 
25 | .. automodule:: tweet_parser.tweet
26 |     :members:
27 |     :undoc-members:
28 |     :show-inheritance:
29 | 
30 | tweet\_parser\.tweet\_checking module
31 | -------------------------------------
32 | 
33 | .. automodule:: tweet_parser.tweet_checking
34 |     :members:
35 |     :undoc-members:
36 |     :show-inheritance:
37 | 
38 | tweet\_parser\.tweet\_keys module
39 | ---------------------------------
40 | 
41 | .. automodule:: tweet_parser.tweet_keys
42 |     :members:
43 |     :undoc-members:
44 |     :show-inheritance:
45 | 
46 | tweet\_parser\.tweet\_parser\_errors module
47 | -------------------------------------------
48 | 
49 | .. automodule:: tweet_parser.tweet_parser_errors
50 |     :members:
51 |     :undoc-members:
52 |     :show-inheritance:
53 | 
54 | 
55 | Module contents
56 | ---------------
57 | 
58 | .. automodule:: tweet_parser
59 |     :members:
60 |     :undoc-members:
61 |     :show-inheritance:
62 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # Copyright 2018 Twitter, Inc.
 3 | # Licensed under the MIT License
 4 | # https://opensource.org/licenses/MIT
 5 | from setuptools import setup, find_packages
 6 | 
 7 | setup(name='tweet_parser',
 8 |       description="Tools for Tweet parsing",
 9 |       url='https://github.com/twitterdev/tweet_parser',
10 |       author='Fiona Pigott, Jeff Kolb, Josh Montague, Aaron Gonzales',
11 |       long_description=open('README.rst', 'r').read(),
12 |       author_email='fpigott@twitter.com',
13 |       license='MIT',
14 |       version='1.13.2',
15 |       packages=find_packages(),
16 |       scripts=["tools/parse_tweets.py"],
17 |       install_requires=[],
18 |      )
19 | 


--------------------------------------------------------------------------------
/test/test_tweet_parser.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | # Copyright 2018 Twitter, Inc.
  3 | # Licensed under the MIT License
  4 | # https://opensource.org/licenses/MIT
  5 | import unittest
  6 | import fileinput
  7 | import json
  8 | import warnings
  9 | from tweet_parser.tweet import Tweet
 10 | from tweet_parser import tweet_checking
 11 | from tweet_parser.tweet_parser_errors import NotATweetError, NotAvailableError, UnexpectedFormatError
 12 | from tweet_parser.deprecator import FieldDeprecationWarning
 13 | 
 14 | def make_a_string(data):
 15 |     if type(data) == str:
 16 |         return data
 17 |     elif type(data) == set:
 18 |         return "{" + ", ".join(sorted(list(data))) + "}"
 19 |     else:
 20 |         return data.__repr__()
 21 | 
 22 | 
 23 | class TestTweetMethods(unittest.TestCase):
 24 | 
 25 |     def setUp(self):
 26 |         tweet_payloads = {}
 27 |         tweet_payloads["activity_streams"] = {}
 28 |         tweet_payloads["original_format"] = {}
 29 |         tweet_ids = []
 30 |         for line in fileinput.FileInput("tweet_payload_examples/activity_streams_examples.json"):
 31 |             tweet = Tweet(json.loads(line))
 32 |             tweet_ids.append(tweet.id)
 33 |             tweet_payloads["activity_streams"][tweet.id] = tweet
 34 |         for line in fileinput.FileInput("tweet_payload_examples/original_format_examples.json"):
 35 |             tweet = Tweet(json.loads(line))
 36 |             tweet_ids.append(tweet.id)
 37 |             tweet_payloads["original_format"][tweet.id] = tweet
 38 |         self.tweet_payloads = tweet_payloads
 39 |         self.tweet_ids = list(set(tweet_ids))
 40 | 
 41 |     def test_equivalent_formats(self):
 42 |         list_of_attrs = sorted([x for x in list(set(dir(Tweet)) - set(dir(dict))) if x[0] != "_"])
 43 |         for tweet_id in self.tweet_ids:
 44 |             # we know that we can't get polls in activity streams
 45 |             if self.tweet_payloads["original_format"][tweet_id].poll_options == []:
 46 |                 for attr in list_of_attrs:
 47 |                     try:
 48 |                         orig = getattr(self.tweet_payloads["original_format"][tweet_id], attr)
 49 |                         if type(orig) == Tweet:
 50 |                             orig = orig.id
 51 |                     except NotAvailableError as e:
 52 |                         orig = e.__repr__()
 53 |                     try:
 54 |                         acti = getattr(self.tweet_payloads["activity_streams"][tweet_id], attr)
 55 |                         if type(acti) == Tweet:
 56 |                             acti = acti.id
 57 |                         acti = acti
 58 |                     except NotAvailableError as e:
 59 |                         acti = e.__repr__()
 60 |                     # for some reason the ["body"]/["text"] truncations are different in as vs og
 61 |                     if attr == "text":
 62 |                         orig = orig[0:100]
 63 |                         acti = acti[0:100]
 64 |                     if attr not in ["poll_options","in_reply_to_user_id","quote_count"]:  # will raise an error in activity streams
 65 |                         self.assertEqual(orig, acti)
 66 | 
 67 |     def test_bad_payloads(self):
 68 |         # missing the user field, raises a "NotATweetError"
 69 |         with self.assertRaises(NotATweetError):
 70 |             f = open("tweet_payload_examples/broken_and_unsupported_payloads/original_format_missing_user.json", "r")
 71 |             tweet = json.load(f)
 72 |             f.close()
 73 |             Tweet(tweet)
 74 |         # missing a different required field, raises "UnexpectedFormatError"
 75 |         with self.assertRaises(UnexpectedFormatError):
 76 |             f = open("tweet_payload_examples/broken_and_unsupported_payloads/original_format_missing_field.json", "r")
 77 |             tweet = json.load(f)
 78 |             f.close()
 79 |             Tweet(tweet, do_format_validation=True)
 80 |         # missing a different required field, raises "UnexpectedFormatError"
 81 |         with self.assertRaises(UnexpectedFormatError):
 82 |             f = open("tweet_payload_examples/broken_and_unsupported_payloads/activity_streams_missing_field.json", "r")
 83 |             tweet = json.load(f)
 84 |             f.close()
 85 |             Tweet(tweet, do_format_validation=True)
 86 |         # added a new field, raises "UnexpectedFormatError"
 87 |         with self.assertRaises(UnexpectedFormatError):
 88 |             f = open("tweet_payload_examples/broken_and_unsupported_payloads/activity_streams_additional_field.json", "r")
 89 |             tweet = json.load(f)
 90 |             f.close()
 91 |             Tweet(tweet, do_format_validation=True)
 92 |         # added a new field, raises "UnexpectedFormatError"
 93 |         with self.assertRaises(UnexpectedFormatError):
 94 |             f = open("tweet_payload_examples/broken_and_unsupported_payloads/original_format_additional_field.json", "r")
 95 |             tweet = json.load(f)
 96 |             f.close()
 97 |             Tweet(tweet, do_format_validation=True)
 98 |         # note: these tests aren't going to cover some kinds of malformed payloads (i.e., "quote tweet" section is missing fields)
 99 | 
100 |     def test_check_format(self):
101 |         superset = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}
102 |         minset = {2, 4, 6, 8, 10}
103 |         too_many_keys = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}
104 |         too_few_keys = {2, 4, 6, 8}
105 |         just_right = {1, 2, 4, 6, 8, 10}
106 |         with self.assertRaises(UnexpectedFormatError) as exception:
107 |             tweet_checking.key_validation_check(too_many_keys, superset, minset)
108 |         with self.assertRaises(UnexpectedFormatError) as exception:
109 |             tweet_checking.key_validation_check(too_few_keys, superset, minset)
110 |         self.assertEqual(0, tweet_checking.key_validation_check(just_right, superset, minset))
111 | 
112 |     def test_get_all_keys(self):
113 |         # define a test nested dict:
114 |         test_dict = {"a": {"b": "c", "d": {"e": "f", "g": "h"}}, "i": "j"}
115 |         self.assertEqual(set(tweet_checking.get_all_keys(test_dict)), {"a b", "a d e", "a d g", "i"})
116 | 
117 | 
# Run the test suite when this file is executed directly.
if __name__ == "__main__":
    # Deprecation warnings are currently left enabled; to silence them, wrap
    # the call below like so:
    #with warnings.catch_warnings():
    #    warnings.simplefilter("ignore", FieldDeprecationWarning)
    unittest.main()
122 | 


--------------------------------------------------------------------------------
/test/tweet_payload_examples/broken_and_unsupported_payloads/activity_streams_additional_field.json:
--------------------------------------------------------------------------------
1 | {"extra_field":"stuff","lang":"en","id":867503895978754048,"place":{"bounding_box":{"coordinates":[[[-70.608694,-33.486182],[-70.608694,-33.364619],[-70.429141,-33.364619],[-70.429141,-33.486182]]],"type":"Polygon"},"id":"00c4b64e7affea25","attributes":{},"full_name":"Las Condes, Chile","name":"Las Condes","place_type":"city","country":"Chile","url":"https:\/\/api.twitter.com\/1.1\/geo\/id\/00c4b64e7affea25.json","country_code":"CL"},"source":"<a href=\"http:\/\/twitter.com\" rel=\"nofollow\">Twitter Web Client<\/a>","geo":null,"truncated":false,"is_quote_status":false,"in_reply_to_status_id_str":null,"text":"I) I almost forgot to include a poll Tweet!","favorite_count":0,"quote_count":0,"in_reply_to_user_id":null,"in_reply_to_status_id":null,"filter_level":"low","entities":{"urls":[],"user_mentions":[],"hashtags":[],"polls":[{"duration_minutes":1440,"end_datetime":"Thu May 25 22:13:40 +0000 2017","options":[{"position":1,"text":"\ud83d\ude04"},{"position":2,"text":"\ud83e\udd10"},{"position":3,"text":"\ud83d\ude31"}]}],"symbols":[]},"in_reply_to_user_id_str":null,"id_str":"867503895978754048","reply_count":0,"user":{"listed_count":7,"lang":"en","id":815279070241955840,"protected":false,"verified":false,"follow_request_sent":null,"default_profile_image":false,"url":null,"id_str":"815279070241955840","friends_count":0,"profile_text_color":"000000","profile_sidebar_fill_color":"000000","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","name":"jk 
no","notifications":null,"profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","location":null,"profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/815279765259100161\/RID37OuE_normal.jpg","geo_enabled":true,"time_zone":null,"is_translator":false,"statuses_count":64,"translator_type":"none","profile_background_tile":false,"derived":{"klout":{"score":17,"interest_topics":[{"name":"Twitter","score":0.86,"id":"10000000000000008253","url":"http:\/\/klout.com\/topic\/id\/10000000000000008253"},{"name":"Media","score":0.71,"id":"7783102141237674703","url":"http:\/\/klout.com\/topic\/id\/7783102141237674703"},{"name":"Emoji","score":0.5,"id":"10000000000000019376","url":"http:\/\/klout.com\/topic\/id\/10000000000000019376"},{"name":"C++","score":0.44,"id":"9219221220892057324","url":"http:\/\/klout.com\/topic\/id\/9219221220892057324"},{"name":"Boulder","score":0.43,"id":"7003086526134829815","url":"http:\/\/klout.com\/topic\/id\/7003086526134829815"}],"user_id":"133700650730839424","influence_topics":[{"name":"Technology","score":0.53,"id":"10000000000000016635","url":"http:\/\/klout.com\/topic\/id\/10000000000000016635"},{"name":"Latin","score":0.47,"id":"5227535270209280137","url":"http:\/\/klout.com\/topic\/id\/5227535270209280137"},{"name":"Beer","score":0.47,"id":"7225108339966145103","url":"http:\/\/klout.com\/topic\/id\/7225108339966145103"},{"name":"Business","score":0.44,"id":"10000000000000016634","url":"http:\/\/klout.com\/topic\/id\/10000000000000016634"},{"name":"Food and 
Drink","score":0.43,"id":"10000000000000000001","url":"http:\/\/klout.com\/topic\/id\/10000000000000000001"}],"profile_url":"http:\/\/klout.com\/user\/id\/133700650730839424"}},"contributors_enabled":false,"profile_use_background_image":false,"utc_offset":null,"screen_name":"RobotPrincessFi","profile_background_color":"000000","followers_count":2,"profile_sidebar_border_color":"000000","default_profile":false,"profile_link_color":"F58EA8","following":null,"created_at":"Sat Dec 31 19:30:52 +0000 2016","profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/815279765259100161\/RID37OuE_normal.jpg","description":"Niche millennial content aggregator | Just messing with Twitter bots | github: https:\/\/github.com\/fionapigott\/fiona-bot","favourites_count":0},"coordinates":null,"retweeted":false,"retweet_count":0,"contributors":null,"created_at":"Wed May 24 22:13:40 +0000 2017","in_reply_to_screen_name":null,"favorited":false,"matching_rules":[{"tag":null}]}
2 | 


--------------------------------------------------------------------------------
/test/tweet_payload_examples/broken_and_unsupported_payloads/activity_streams_missing_field.json:
--------------------------------------------------------------------------------
1 | {"twitter_lang":"en","actor":{"links":[{"rel":"me","href":null}],"favoritesCount":0,"displayName":"jk no","languages":["en"],"statusesCount":64,"friendsCount":0,"objectType":"person","image":"https:\/\/pbs.twimg.com\/profile_images\/815279765259100161\/RID37OuE_normal.jpg","twitterTimeZone":null,"preferredUsername":"RobotPrincessFi","verified":false,"followersCount":2,"utcOffset":null,"id":"id:twitter.com:815279070241955840","link":"http:\/\/www.twitter.com\/RobotPrincessFi","postedTime":"2016-12-31T19:30:52.362Z","listedCount":7},"location":{"displayName":"Las Condes, Chile","twitter_country_code":"CL","link":"https:\/\/api.twitter.com\/1.1\/geo\/id\/00c4b64e7affea25.json","name":"Las Condes","objectType":"place","twitter_place_type":"city","country_code":"Chile","geo":{"type":"Polygon","coordinates":[[[-70.608694,-33.486182],[-70.608694,-33.364619],[-70.429141,-33.364619],[-70.429141,-33.486182]]]}},"body":"K) Let's try just one photo! \ud83d\udc30 https:\/\/t.co\/ubynnad49V","retweetCount":0,"id":"tag:search.twitter.com,2005:867834809732677634","objectType":"activity","postedTime":"2017-05-25T20:08:36.000Z","verb":"post","object":{"summary":"K) Let's try just one photo! 
\ud83d\udc30 https:\/\/t.co\/ubynnad49V","id":"object:search.twitter.com,2005:867834809732677634","objectType":"note","postedTime":"2017-05-25T20:08:36.000Z","link":"http:\/\/twitter.com\/RobotPrincessFi\/statuses\/867834809732677634"},"display_text_range":[0,30],"provider":{"link":"http:\/\/www.twitter.com","objectType":"service","displayName":"Twitter"},"link":"http:\/\/twitter.com\/RobotPrincessFi\/statuses\/867834809732677634","favoritesCount":0,"twitter_filter_level":"low","twitter_entities":{"hashtags":[],"urls":[],"user_mentions":[],"media":[{"media_url_https":"https:\/\/pbs.twimg.com\/media\/DAsrG_tWsAEzeEy.jpg","media_url":"http:\/\/pbs.twimg.com\/media\/DAsrG_tWsAEzeEy.jpg","expanded_url":"https:\/\/twitter.com\/RobotPrincessFi\/status\/867834809732677634\/photo\/1","type":"photo","indices":[31,54],"url":"https:\/\/t.co\/ubynnad49V","id":867834752379826177,"id_str":"867834752379826177","display_url":"pic.twitter.com\/ubynnad49V","sizes":{"large":{"w":355,"resize":"fit","h":236},"thumb":{"w":150,"resize":"crop","h":150},"medium":{"w":355,"resize":"fit","h":236},"small":{"w":355,"resize":"fit","h":236}}}],"symbols":[]},"gnip":{"matching_rules":[{"tag":null}],"klout_score":17,"klout_profile":{"link":"http:\/\/klout.com\/user\/id\/133700650730839424","topics":[{"link":"http:\/\/klout.com\/topic\/id\/10000000000000016635","id":"10000000000000016635","displayName":"Technology","score":0.53,"topic_type":"influence"},{"link":"http:\/\/klout.com\/topic\/id\/5227535270209280137","id":"5227535270209280137","displayName":"Latin","score":0.47,"topic_type":"influence"},{"link":"http:\/\/klout.com\/topic\/id\/7225108339966145103","id":"7225108339966145103","displayName":"Beer","score":0.47,"topic_type":"influence"},{"link":"http:\/\/klout.com\/topic\/id\/10000000000000016634","id":"10000000000000016634","displayName":"Business","score":0.44,"topic_type":"influence"},{"link":"http:\/\/klout.com\/topic\/id\/10000000000000000001","id":"10000000000000000001","displayName":"F
ood and Drink","score":0.43,"topic_type":"influence"},{"link":"http:\/\/klout.com\/topic\/id\/10000000000000008253","id":"10000000000000008253","displayName":"Twitter","score":0.86,"topic_type":"interest"},{"link":"http:\/\/klout.com\/topic\/id\/7783102141237674703","id":"7783102141237674703","displayName":"Media","score":0.71,"topic_type":"interest"},{"link":"http:\/\/klout.com\/topic\/id\/10000000000000019376","id":"10000000000000019376","displayName":"Emoji","score":0.5,"topic_type":"interest"},{"link":"http:\/\/klout.com\/topic\/id\/9219221220892057324","id":"9219221220892057324","displayName":"C++","score":0.44,"topic_type":"interest"},{"link":"http:\/\/klout.com\/topic\/id\/7003086526134829815","id":"7003086526134829815","displayName":"Boulder","score":0.43,"topic_type":"interest"}],"klout_user_id":"133700650730839424"}},"twitter_extended_entities":{"media":[{"media_url_https":"https:\/\/pbs.twimg.com\/media\/DAsrG_tWsAEzeEy.jpg","media_url":"http:\/\/pbs.twimg.com\/media\/DAsrG_tWsAEzeEy.jpg","expanded_url":"https:\/\/twitter.com\/RobotPrincessFi\/status\/867834809732677634\/photo\/1","type":"photo","indices":[31,54],"url":"https:\/\/t.co\/ubynnad49V","id":867834752379826177,"id_str":"867834752379826177","display_url":"pic.twitter.com\/ubynnad49V","sizes":{"large":{"w":355,"resize":"fit","h":236},"thumb":{"w":150,"resize":"crop","h":150},"medium":{"w":355,"resize":"fit","h":236},"small":{"w":355,"resize":"fit","h":236}}}]},"generator":{"link":"http:\/\/twitter.com","displayName":"Twitter Web Client"}}
2 | 


--------------------------------------------------------------------------------
/test/tweet_payload_examples/broken_and_unsupported_payloads/original_format_additional_field.json:
--------------------------------------------------------------------------------
1 | {"unexpected_field":"blahhhhhh","display_text_range":[0,30],"lang":"en","id":867834809732677634,"place":{"bounding_box":{"coordinates":[[[-70.608694,-33.486182],[-70.608694,-33.364619],[-70.429141,-33.364619],[-70.429141,-33.486182]]],"type":"Polygon"},"id":"00c4b64e7affea25","attributes":{},"full_name":"Las Condes, Chile","name":"Las Condes","place_type":"city","country":"Chile","url":"https:\/\/api.twitter.com\/1.1\/geo\/id\/00c4b64e7affea25.json","country_code":"CL"},"source":"<a href=\"http:\/\/twitter.com\" rel=\"nofollow\">Twitter Web Client<\/a>","geo":null,"possibly_sensitive":false,"truncated":false,"is_quote_status":false,"in_reply_to_status_id_str":null,"text":"K) Let's try just one photo! \ud83d\udc30 https:\/\/t.co\/ubynnad49V","favorite_count":0,"quote_count":0,"in_reply_to_user_id":null,"in_reply_to_status_id":null,"extended_entities":{"media":[{"indices":[31,54],"sizes":{"thumb":{"resize":"crop","h":150,"w":150},"medium":{"resize":"fit","h":236,"w":355},"small":{"resize":"fit","h":236,"w":355},"large":{"resize":"fit","h":236,"w":355}},"id":867834752379826177,"media_url":"http:\/\/pbs.twimg.com\/media\/DAsrG_tWsAEzeEy.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/DAsrG_tWsAEzeEy.jpg","url":"https:\/\/t.co\/ubynnad49V","type":"photo","display_url":"pic.twitter.com\/ubynnad49V","id_str":"867834752379826177","expanded_url":"https:\/\/twitter.com\/RobotPrincessFi\/status\/867834809732677634\/photo\/1"}]},"filter_level":"low","entities":{"urls":[],"user_mentions":[],"media":[{"indices":[31,54],"sizes":{"thumb":{"resize":"crop","h":150,"w":150},"medium":{"resize":"fit","h":236,"w":355},"small":{"resize":"fit","h":236,"w":355},"large":{"resize":"fit","h":236,"w":355}},"id":867834752379826177,"media_url":"http:\/\/pbs.twimg.com\/media\/DAsrG_tWsAEzeEy.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/DAsrG_tWsAEzeEy.jpg","url":"https:\/\/t.co\/ubynnad49V","type":"photo","display_url":"pic.twitter.com\/ubynnad49V","id_str":"8678347523798261
77","expanded_url":"https:\/\/twitter.com\/RobotPrincessFi\/status\/867834809732677634\/photo\/1"}],"hashtags":[],"symbols":[]},"in_reply_to_user_id_str":null,"id_str":"867834809732677634","reply_count":0,"user":{"listed_count":7,"lang":"en","id":815279070241955840,"protected":false,"verified":false,"follow_request_sent":null,"default_profile_image":false,"url":null,"id_str":"815279070241955840","friends_count":0,"profile_text_color":"000000","profile_sidebar_fill_color":"000000","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","name":"jk no","notifications":null,"profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","location":null,"profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/815279765259100161\/RID37OuE_normal.jpg","geo_enabled":true,"time_zone":null,"is_translator":false,"statuses_count":64,"translator_type":"none","profile_background_tile":false,"derived":{"klout":{"score":17,"interest_topics":[{"name":"Twitter","score":0.86,"id":"10000000000000008253","url":"http:\/\/klout.com\/topic\/id\/10000000000000008253"},{"name":"Media","score":0.71,"id":"7783102141237674703","url":"http:\/\/klout.com\/topic\/id\/7783102141237674703"},{"name":"Emoji","score":0.5,"id":"10000000000000019376","url":"http:\/\/klout.com\/topic\/id\/10000000000000019376"},{"name":"C++","score":0.44,"id":"9219221220892057324","url":"http:\/\/klout.com\/topic\/id\/9219221220892057324"},{"name":"Boulder","score":0.43,"id":"7003086526134829815","url":"http:\/\/klout.com\/topic\/id\/7003086526134829815"}],"user_id":"133700650730839424","influence_topics":[{"name":"Technology","score":0.53,"id":"10000000000000016635","url":"http:\/\/klout.com\/topic\/id\/10000000000000016635"},{"name":"Latin","score":0.47,"id":"5227535270209280137","url":"http:\/\/klout.com\/topic\/id\/5227535270209280137"},{"name":"Beer","score":0.47,"id":"7225108339966145103","url":"http:\/\/klout.com\/topic\/id\/72251083399661451
03"},{"name":"Business","score":0.44,"id":"10000000000000016634","url":"http:\/\/klout.com\/topic\/id\/10000000000000016634"},{"name":"Food and Drink","score":0.43,"id":"10000000000000000001","url":"http:\/\/klout.com\/topic\/id\/10000000000000000001"}],"profile_url":"http:\/\/klout.com\/user\/id\/133700650730839424"}},"contributors_enabled":false,"profile_use_background_image":false,"utc_offset":null,"screen_name":"RobotPrincessFi","profile_background_color":"000000","followers_count":2,"profile_sidebar_border_color":"000000","default_profile":false,"profile_link_color":"F58EA8","following":null,"created_at":"Sat Dec 31 19:30:52 +0000 2016","profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/815279765259100161\/RID37OuE_normal.jpg","description":"Niche millennial content aggregator | Just messing with Twitter bots | github: https:\/\/github.com\/fionapigott\/fiona-bot","favourites_count":0},"coordinates":null,"retweeted":false,"retweet_count":0,"contributors":null,"created_at":"Thu May 25 20:08:36 +0000 2017","in_reply_to_screen_name":null,"favorited":false,"matching_rules":[{"tag":null}]}
2 | 


--------------------------------------------------------------------------------
/test/tweet_payload_examples/broken_and_unsupported_payloads/original_format_missing_field.json:
--------------------------------------------------------------------------------
1 | {"lang":"en","id":867468138991964200,"place":{"bounding_box":{"coordinates":[[[-70.608694,-33.486182],[-70.608694,-33.364619],[-70.429141,-33.364619],[-70.429141,-33.486182]]],"type":"Polygon"},"id":"00c4b64e7affea25","attributes":{},"full_name":"Las Condes, Chile","name":"Las Condes","place_type":"city","country":"Chile","url":"https://api.twitter.com/1.1/geo/id/00c4b64e7affea25.json","country_code":"CL"},"source":"<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>","geo":null,"truncated":false,"is_quote_status":false,"in_reply_to_status_id_str":null,"text":"A) This is a regular old Tweet. 🐣","favorite_count":0,"quote_count":0,"in_reply_to_user_id":null,"in_reply_to_status_id":null,"filter_level":"low","entities":{"urls":[],"user_mentions":[],"hashtags":[],"symbols":[]},"in_reply_to_user_id_str":null,"id_str":"867468138991964160","reply_count":0,"user":{"listed_count":7,"lang":"en","id":815279070241955800,"protected":false,"verified":false,"follow_request_sent":null,"default_profile_image":false,"url":null,"id_str":"815279070241955840","friends_count":0,"profile_text_color":"000000","profile_sidebar_fill_color":"000000","profile_background_image_url":"http://abs.twimg.com/images/themes/theme1/bg.png","name":"jk 
no","notifications":null,"profile_background_image_url_https":"https://abs.twimg.com/images/themes/theme1/bg.png","location":null,"profile_image_url_https":"https://pbs.twimg.com/profile_images/815279765259100161/RID37OuE_normal.jpg","geo_enabled":true,"time_zone":null,"is_translator":false,"statuses_count":64,"translator_type":"none","profile_background_tile":false,"derived":{"klout":{"score":17,"interest_topics":[{"name":"Twitter","score":0.86,"id":"10000000000000008253","url":"http://klout.com/topic/id/10000000000000008253"},{"name":"Media","score":0.71,"id":"7783102141237674703","url":"http://klout.com/topic/id/7783102141237674703"},{"name":"Emoji","score":0.5,"id":"10000000000000019376","url":"http://klout.com/topic/id/10000000000000019376"},{"name":"C++","score":0.44,"id":"9219221220892057324","url":"http://klout.com/topic/id/9219221220892057324"},{"name":"Boulder","score":0.43,"id":"7003086526134829815","url":"http://klout.com/topic/id/7003086526134829815"}],"user_id":"133700650730839424","influence_topics":[{"name":"Technology","score":0.53,"id":"10000000000000016635","url":"http://klout.com/topic/id/10000000000000016635"},{"name":"Latin","score":0.47,"id":"5227535270209280137","url":"http://klout.com/topic/id/5227535270209280137"},{"name":"Beer","score":0.47,"id":"7225108339966145103","url":"http://klout.com/topic/id/7225108339966145103"},{"name":"Business","score":0.44,"id":"10000000000000016634","url":"http://klout.com/topic/id/10000000000000016634"},{"name":"Food and Drink","score":0.43,"id":"10000000000000000001","url":"http://klout.com/topic/id/10000000000000000001"}],"profile_url":"http://klout.com/user/id/133700650730839424"}},"contributors_enabled":false,"profile_use_background_image":false,"utc_offset":null,"screen_name":"RobotPrincessFi","profile_background_color":"000000","followers_count":2,"profile_sidebar_border_color":"000000","default_profile":false,"profile_link_color":"F58EA8","following":null,"created_at":"Sat Dec 31 19:30:52 +0000 
2016","profile_image_url":"http://pbs.twimg.com/profile_images/815279765259100161/RID37OuE_normal.jpg","favourites_count":0},"coordinates":null,"retweeted":false,"retweet_count":0,"contributors":null,"created_at":"Wed May 24 19:51:35 +0000 2017","in_reply_to_screen_name":null,"favorited":false,"matching_rules":[{"tag":null}]}
2 | 


--------------------------------------------------------------------------------
/test/tweet_payload_examples/broken_and_unsupported_payloads/original_format_missing_quotetweet_field.json:
--------------------------------------------------------------------------------
1 | {"display_text_range":[0,140],"lang":"en","id":867479301360205800,"place":null,"source":"<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>","geo":null,"possibly_sensitive":false,"truncated":true,"is_quote_status":true,"in_reply_to_status_id_str":null,"text":"Try using \"xxd\" on this 🙃:\n(Notice the \"joiner\" characters)\n👮‍♀️👩‍🚒👩‍🔧👩‍🏭👷‍♀️👨‍🚒👨‍🌾👨‍🍳👨‍🎤👩‍🎨👨‍💼👩‍🎓👨‍🏫👨‍🎨👩‍💻👩‍🔬👨‍🚀👩‍… https://t.co/QpqFmEDBLQ","favorite_count":0,"quote_count":0,"in_reply_to_user_id":null,"quoted_status_id":861652051016663000,"in_reply_to_status_id":null,"extended_tweet":{"full_text":"Try using \"xxd\" on this 🙃:\n(Notice the \"joiner\" characters)\n👮‍♀️👩‍🚒👩‍🔧👩‍🏭👷‍♀️👨‍🚒👨‍🌾👨‍🍳👨‍🎤👩‍🎨👨‍💼👩‍🎓👨‍🏫👨‍🎨👩‍💻👩‍🔬👨‍🚀👩‍⚕️👨‍✈️👨‍⚖️🕵️\n 🧙‍♂️ Magic! https://t.co/5PU0FLFRYz","display_text_range":[0,140],"entities":{"urls":[{"unwound":{"description":"“Also, if you're on a Unix machine, try using \"xxd\" to see what your text is stored as. Copy-paste: $ echo \"🙄\" | xxd $ echo \"🙄\" | xxd -b”","status":200,"title":"👸🏼🙄 Fiona on Twitter","url":"https://twitter.com/notFromShrek/status/861652051016663040"},"indices":[141,164],"expanded_url":"https://twitter.com/notFromShrek/status/861652051016663040","display_url":"twitter.com/notFromShrek/s…","url":"https://t.co/5PU0FLFRYz"}],"user_mentions":[],"hashtags":[],"symbols":[]}},"filter_level":"low","quoted_status":{"lang":"en","id":861652051016663000,"place":{"bounding_box":{"coordinates":[[[-105.301776,39.953552],[-105.301776,40.094411],[-105.183597,40.094411],[-105.183597,39.953552]]],"type":"Polygon"},"id":"fd70c22040963ac7","attributes":{},"full_name":"Boulder, CO","name":"Boulder","place_type":"city","country":"United States","url":"https://api.twitter.com/1.1/geo/id/fd70c22040963ac7.json","country_code":"US"},"source":"<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>","geo":null,"truncated":false,"is_quote_status":false,"in_reply_to_status_id_str":"861651727614746624","text":"Also, if you're on a Unix machine, try 
using \"xxd\" to see what your text is stored as. Copy-paste:\n$ echo \"🙄\" | xxd\n$ echo \"🙄\" | xxd -b","favorite_count":0,"quote_count":0,"in_reply_to_user_id":2382763597,"in_reply_to_status_id":861651727614746600,"filter_level":"low","entities":{"urls":[],"user_mentions":[],"hashtags":[],"symbols":[]},"in_reply_to_user_id_str":"2382763597","id_str":"861652051016663040","reply_count":2,"coordinates":null,"retweeted":false,"retweet_count":0,"contributors":null,"created_at":"Mon May 08 18:40:32 +0000 2017","in_reply_to_screen_name":"notFromShrek","favorited":false},"in_reply_to_user_id_str":null,"id_str":"867479301360205824","reply_count":0,"user":{"listed_count":7,"lang":"en","id":815279070241955800,"protected":false,"verified":false,"follow_request_sent":null,"default_profile_image":false,"url":null,"id_str":"815279070241955840","friends_count":0,"profile_text_color":"000000","profile_sidebar_fill_color":"000000","profile_background_image_url":"http://abs.twimg.com/images/themes/theme1/bg.png","name":"jk 
no","notifications":null,"profile_background_image_url_https":"https://abs.twimg.com/images/themes/theme1/bg.png","location":null,"profile_image_url_https":"https://pbs.twimg.com/profile_images/815279765259100161/RID37OuE_normal.jpg","geo_enabled":true,"time_zone":null,"is_translator":false,"statuses_count":64,"translator_type":"none","profile_background_tile":false,"derived":{"klout":{"score":17,"interest_topics":[{"name":"Twitter","score":0.86,"id":"10000000000000008253","url":"http://klout.com/topic/id/10000000000000008253"},{"name":"Media","score":0.71,"id":"7783102141237674703","url":"http://klout.com/topic/id/7783102141237674703"},{"name":"Emoji","score":0.5,"id":"10000000000000019376","url":"http://klout.com/topic/id/10000000000000019376"},{"name":"C++","score":0.44,"id":"9219221220892057324","url":"http://klout.com/topic/id/9219221220892057324"},{"name":"Boulder","score":0.43,"id":"7003086526134829815","url":"http://klout.com/topic/id/7003086526134829815"}],"user_id":"133700650730839424","influence_topics":[{"name":"Technology","score":0.53,"id":"10000000000000016635","url":"http://klout.com/topic/id/10000000000000016635"},{"name":"Latin","score":0.47,"id":"5227535270209280137","url":"http://klout.com/topic/id/5227535270209280137"},{"name":"Beer","score":0.47,"id":"7225108339966145103","url":"http://klout.com/topic/id/7225108339966145103"},{"name":"Business","score":0.44,"id":"10000000000000016634","url":"http://klout.com/topic/id/10000000000000016634"},{"name":"Food and Drink","score":0.43,"id":"10000000000000000001","url":"http://klout.com/topic/id/10000000000000000001"}],"profile_url":"http://klout.com/user/id/133700650730839424"}},"contributors_enabled":false,"profile_use_background_image":false,"utc_offset":null,"screen_name":"RobotPrincessFi","profile_background_color":"000000","followers_count":2,"profile_sidebar_border_color":"000000","default_profile":false,"profile_link_color":"F58EA8","following":null,"created_at":"Sat Dec 31 19:30:52 +0000 
2016","profile_image_url":"http://pbs.twimg.com/profile_images/815279765259100161/RID37OuE_normal.jpg","description":"Niche millennial content aggregator | Just messing with Twitter bots | github: https://github.com/fionapigott/fiona-bot","favourites_count":0},"coordinates":null,"retweeted":false,"retweet_count":0,"contributors":null,"entities":{"urls":[{"indices":[117,140],"expanded_url":"https://twitter.com/i/web/status/867479301360205824","display_url":"twitter.com/i/web/status/8…","url":"https://t.co/QpqFmEDBLQ"}],"user_mentions":[],"hashtags":[],"symbols":[]},"created_at":"Wed May 24 20:35:57 +0000 2017","in_reply_to_screen_name":null,"quoted_status_id_str":"861652051016663040","favorited":false,"matching_rules":[{"tag":null}]}
2 | 


--------------------------------------------------------------------------------
/test/tweet_payload_examples/broken_and_unsupported_payloads/original_format_missing_user.json:
--------------------------------------------------------------------------------
1 | {"display_text_range":[0,48],"lang":"en","id":867833721579122688,"place":{"bounding_box":{"coordinates":[[[-70.608694,-33.486182],[-70.608694,-33.364619],[-70.429141,-33.364619],[-70.429141,-33.486182]]],"type":"Polygon"},"id":"00c4b64e7affea25","attributes":{},"full_name":"Las Condes, Chile","name":"Las Condes","place_type":"city","country":"Chile","url":"https:\/\/api.twitter.com\/1.1\/geo\/id\/00c4b64e7affea25.json","country_code":"CL"},"source":"<a href=\"http:\/\/twitter.com\" rel=\"nofollow\">Twitter Web Client<\/a>","geo":null,"possibly_sensitive":false,"truncated":false,"is_quote_status":false,"in_reply_to_status_id_str":null,"text":"J) I'm gonna include *two* photos in this Tweet! https:\/\/t.co\/iOGDJoWfME","favorite_count":0,"quote_count":0,"in_reply_to_user_id":null,"in_reply_to_status_id":null,"extended_entities":{"media":[{"indices":[49,72],"sizes":{"thumb":{"resize":"crop","h":150,"w":150},"medium":{"resize":"fit","h":236,"w":355},"small":{"resize":"fit","h":236,"w":355},"large":{"resize":"fit","h":236,"w":355}},"id":867833378313293826,"media_url":"http:\/\/pbs.twimg.com\/media\/DAsp3A6XUAITXbX.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/DAsp3A6XUAITXbX.jpg","url":"https:\/\/t.co\/iOGDJoWfME","type":"photo","display_url":"pic.twitter.com\/iOGDJoWfME","id_str":"867833378313293826","expanded_url":"https:\/\/twitter.com\/RobotPrincessFi\/status\/867833721579122688\/photo\/1"},{"indices":[49,72],"sizes":{"thumb":{"resize":"crop","h":150,"w":150},"medium":{"resize":"fit","h":423,"w":564},"small":{"resize":"fit","h":423,"w":564},"large":{"resize":"fit","h":423,"w":564}},"id":867833707989807104,"media_url":"http:\/\/pbs.twimg.com\/media\/DAsqKNDXsAAgcYI.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/DAsqKNDXsAAgcYI.jpg","url":"https:\/\/t.co\/iOGDJoWfME","type":"photo","display_url":"pic.twitter.com\/iOGDJoWfME","id_str":"867833707989807104","expanded_url":"https:\/\/twitter.com\/RobotPrincessFi\/status\/867833721579122688\/photo\/1"
}]},"filter_level":"low","entities":{"urls":[],"user_mentions":[],"media":[{"indices":[49,72],"sizes":{"thumb":{"resize":"crop","h":150,"w":150},"medium":{"resize":"fit","h":236,"w":355},"small":{"resize":"fit","h":236,"w":355},"large":{"resize":"fit","h":236,"w":355}},"id":867833378313293826,"media_url":"http:\/\/pbs.twimg.com\/media\/DAsp3A6XUAITXbX.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/DAsp3A6XUAITXbX.jpg","url":"https:\/\/t.co\/iOGDJoWfME","type":"photo","display_url":"pic.twitter.com\/iOGDJoWfME","id_str":"867833378313293826","expanded_url":"https:\/\/twitter.com\/RobotPrincessFi\/status\/867833721579122688\/photo\/1"}],"hashtags":[],"symbols":[]},"in_reply_to_user_id_str":null,"id_str":"867833721579122688","reply_count":0,"coordinates":null,"retweeted":false,"retweet_count":0,"contributors":null,"created_at":"Thu May 25 20:04:17 +0000 2017","in_reply_to_screen_name":null,"favorited":false,"matching_rules":[{"tag":null}]}
2 | 


--------------------------------------------------------------------------------
/tools/demo_notebook.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "### Notebook to demonstrate the tweet_parser module\n",
  8 |     "#### Fiona Pigott"
  9 |    ]
 10 |   },
 11 |   {
 12 |    "cell_type": "code",
 13 |    "execution_count": null,
 14 |    "metadata": {
 15 |     "collapsed": true
 16 |    },
 17 |    "outputs": [],
 18 |    "source": [
 19 |     "# import the tweet_parser module\n",
 20 |     "from tweet_parser.tweet import Tweet\n",
 21 |     "import fileinput\n",
 22 |     "import json"
 23 |    ]
 24 |   },
 25 |   {
 26 |    "cell_type": "code",
 27 |    "execution_count": null,
 28 |    "metadata": {},
 29 |    "outputs": [],
 30 |    "source": [
 31 |     "# import the exact same set of Tweets in both original format and activity streams format\n",
 32 |     "activity_streams_tweets = []\n",
 33 |     "for line in fileinput.FileInput(\"../test/tweet_payload_examples/activity_streams_examples.json\"):\n",
 34 |     "    activity_streams_tweets.append(Tweet(json.loads(line)))\n",
 35 |     "\n",
 36 |     "original_format_tweets = []\n",
 37 |     "for line in fileinput.FileInput(\"../test/tweet_payload_examples/original_format_examples.json\"):\n",
 38 |     "    original_format_tweets.append(Tweet(json.loads(line)))"
 39 |    ]
 40 |   },
 41 |   {
 42 |    "cell_type": "code",
 43 |    "execution_count": null,
 44 |    "metadata": {},
 45 |    "outputs": [],
 46 |    "source": [
 47 |     "print(\"Available methods: \\n - {}\".format(\"\\n - \".join([x for x in activity_streams_tweets[0].__dir__() if x[0] != \"_\"])))"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "code",
 52 |    "execution_count": null,
 53 |    "metadata": {},
 54 |    "outputs": [],
 55 |    "source": [
 56 |     "activity_streams_tweets[-4].tweet_links"
 57 |    ]
 58 |   },
 59 |   {
 60 |    "cell_type": "code",
 61 |    "execution_count": null,
 62 |    "metadata": {},
 63 |    "outputs": [],
 64 |    "source": [
 65 |     "for i,x in enumerate(activity_streams_tweets):\n",
 66 |     "    print(x.id, x.tweet_type, x.tweet_links)"
 67 |    ]
 68 |   },
 69 |   {
 70 |    "cell_type": "code",
 71 |    "execution_count": null,
 72 |    "metadata": {},
 73 |    "outputs": [],
 74 |    "source": [
 75 |     "for i,x in enumerate(original_format_tweets):\n",
 76 |     "    print(i, \":\", x.all_text)\n",
 77 |     "    print(\"##########\")"
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "code",
 82 |    "execution_count": null,
 83 |    "metadata": {
 84 |     "collapsed": true
 85 |    },
 86 |    "outputs": [],
 87 |    "source": [
 88 |     "quote_ception = original_format_tweets[2]"
 89 |    ]
 90 |   },
 91 |   {
 92 |    "cell_type": "code",
 93 |    "execution_count": null,
 94 |    "metadata": {},
 95 |    "outputs": [],
 96 |    "source": [
 97 |     "quote_ception.hashtags"
 98 |    ]
 99 |   },
100 |   {
101 |    "cell_type": "code",
102 |    "execution_count": null,
103 |    "metadata": {},
104 |    "outputs": [],
105 |    "source": [
106 |     "for x in activity_streams_tweets:\n",
107 |     "    print(x.user_mentions_ids)\n",
108 |     "    print(\"##########\")"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": null,
114 |    "metadata": {},
115 |    "outputs": [],
116 |    "source": [
117 |     "for x in activity_streams_tweets:\n",
118 |     "    print(x.all_text)\n",
119 |     "    print(\"##########\")"
120 |    ]
121 |   },
122 |   {
123 |    "cell_type": "code",
124 |    "execution_count": null,
125 |    "metadata": {},
126 |    "outputs": [],
127 |    "source": [
128 |     "activity_streams_tweets[16].quoted_user"
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "code",
133 |    "execution_count": null,
134 |    "metadata": {},
135 |    "outputs": [],
136 |    "source": [
137 |     "activity_streams_tweets[16].quote_tweet.user_mentions"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "code",
142 |    "execution_count": null,
143 |    "metadata": {
144 |     "scrolled": false
145 |    },
146 |    "outputs": [],
147 |    "source": [
148 |     "for i,x in enumerate(activity_streams_tweets):\n",
149 |     "    print(i,x.quoted_mentions)\n",
150 |     "    print(\"##########\")"
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "code",
155 |    "execution_count": null,
156 |    "metadata": {},
157 |    "outputs": [],
158 |    "source": [
159 |     "for x in original_format_tweets:\n",
160 |     "    print(x.all_text_without_links)\n",
161 |     "    print(\"##########\")"
162 |    ]
163 |   },
164 |   {
165 |    "cell_type": "code",
166 |    "execution_count": null,
167 |    "metadata": {},
168 |    "outputs": [],
169 |    "source": [
170 |     "for x in original_format_tweets:\n",
171 |     "    print(x.most_unrolled_urls)\n",
172 |     "    print(\"##########\")"
173 |    ]
174 |   },
175 |   {
176 |    "cell_type": "code",
177 |    "execution_count": null,
178 |    "metadata": {},
179 |    "outputs": [],
180 |    "source": [
181 |     "for x in original_format_tweets:\n",
182 |     "    print(x.hashtags)\n",
183 |     "    print(\"##########\")"
184 |    ]
185 |   },
186 |   {
187 |    "cell_type": "code",
188 |    "execution_count": null,
189 |    "metadata": {},
190 |    "outputs": [],
191 |    "source": [
192 |     "Tweet({\"thing\":\"th\"})"
193 |    ]
194 |   },
195 |   {
196 |    "cell_type": "code",
197 |    "execution_count": null,
198 |    "metadata": {},
199 |    "outputs": [],
200 |    "source": [
201 |     "malformed_quotetweet = Tweet(json.load(\n",
202 |     "            open(\"tweet_payload_examples/broken_and_unsupported_payloads/original_format_missing_quotetweet_field.json\",\"r\")),\n",
203 |     "                                do_format_checking = True)"
204 |    ]
205 |   },
206 |   {
207 |    "cell_type": "code",
208 |    "execution_count": null,
209 |    "metadata": {},
210 |    "outputs": [],
211 |    "source": [
212 |     "malformed_quotetweet.embedded_tweet"
213 |    ]
214 |   },
215 |   {
216 |    "cell_type": "code",
217 |    "execution_count": null,
218 |    "metadata": {
219 |     "collapsed": true
220 |    },
221 |    "outputs": [],
222 |    "source": []
223 |   }
224 |  ],
225 |  "metadata": {
226 |   "kernelspec": {
227 |    "display_name": "Python 3",
228 |    "language": "python",
229 |    "name": "python3"
230 |   },
231 |   "language_info": {
232 |    "codemirror_mode": {
233 |     "name": "ipython",
234 |     "version": 3
235 |    },
236 |    "file_extension": ".py",
237 |    "mimetype": "text/x-python",
238 |    "name": "python",
239 |    "nbconvert_exporter": "python",
240 |    "pygments_lexer": "ipython3",
241 |    "version": "3.6.1"
242 |   }
243 |  },
244 |  "nbformat": 4,
245 |  "nbformat_minor": 2
246 | }
247 | 


--------------------------------------------------------------------------------
/tools/parse_tweets.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # Copyright 2018 Twitter, Inc.
 3 | # Licensed under the MIT License
 4 | # https://opensource.org/licenses/MIT
 5 | 
 6 | #!/usr/bin/env python
 7 | 
 8 | from tweet_parser.tweet import Tweet
 9 | from tweet_parser.tweet_parser_errors import NotATweetError, NotAvailableError
10 | import argparse
11 | import fileinput
12 | import sys
# Prefer ujson when it is installed; in that case malformed input is
# caught as a plain ValueError.
try:
    import ujson as json
    JSONDecodeError = ValueError
except ImportError:
    import json
    # Only Python 3.5+ is assumed to provide json.JSONDecodeError; every
    # other interpreter falls back to ValueError.
    _has_decode_error = (sys.version_info[0] == 3) and (sys.version_info[1] >= 5)
    JSONDecodeError = json.JSONDecodeError if _has_decode_error else ValueError
22 | 
# Command-line interface: input selection, output columns and the error
# handling switches used by the processing loop below.
parser = argparse.ArgumentParser(
    description="Parse sequence of JSON formatted activities.",
    formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument("-f", "--file", dest="data_files",
                    default="-",
                    help="Name of the file to read from, defaults to stdin")
# Non-underscore attribute names that Tweet has and a plain dict does not;
# they are listed in the --csv help text.
list_of_attrs = sorted(x for x in set(dir(Tweet)) - set(dir(dict)) if x[0] != "_")
parser.add_argument("-c", "--csv", dest="func_list",
                    default="id",
                    help="comma separated list of attributes to get \n possible functions include: \n -> {}".format(" \n -> ".join(list_of_attrs)))
parser.add_argument("-d", "--delim", dest="delim",
                    default="|",
                    help="delimiter for the output csv, defaults to pipe")
parser.add_argument("-z", "--compressed", action="store_true", dest="compressed",
                    default=False,
                    help="use this flag if data is compressed")
parser.add_argument("-j", "--pass_bad_json", action="store_true", dest="pass_bad_json",
                    default=False,
                    help="use this flag to silently pass bad JSON payloads")
parser.add_argument("-t", "--pass_non_tweet", action="store_true", dest="pass_non_tweet",
                    default=False,
                    help="use this flag to silently pass on non-tweet payloads")
# The help text previously duplicated the -t description; this flag only
# silences the warning for attributes that raise NotAvailableError.
parser.add_argument("-a", "--pass_not_available", action="store_true", dest="pass_not_available",
                    default=False,
                    help="use this flag to silently write NOT_AVAILABLE for attributes that are not available")
parser.add_argument("--do_format_validation", action="store_true", dest="do_format_validation",
                    default=False,
                    help="debug formatting")
options = parser.parse_args()
51 | 
# attribute names requested on the command line, in output-column order
functions = options.func_list.split(",")

# only use the decompressing open hook when the input is flagged as compressed
openhook = fileinput.hook_compressed if options.compressed else None

# the interpreter major version decides how attribute values become text
running_python3 = sys.version_info[0] == 3

# parse some tweets: one JSON payload per input line, one delimited row per Tweet
for line in fileinput.FileInput(options.data_files, openhook=openhook):
    # decode the JSON payload; undecodable lines are reported (unless -j) and skipped
    try:
        payload = json.loads(line)
    except JSONDecodeError as json_error:
        if not options.pass_bad_json:
            sys.stderr.write("{}. Use the flag '-j' to pass silently next time.\nBad JSON payload: {}".format(json_error, line))
        continue

    # build the Tweet; non-Tweet payloads are reported (unless -t) and skipped
    try:
        parsed_tweet = Tweet(payload, do_format_validation=options.do_format_validation)
    except NotATweetError as nate:
        if not options.pass_non_tweet:
            sys.stderr.write("{}. Use the flag '-t' to pass silently next time.\nNon Tweet payload: {}".format(nate, line))
        continue

    # collect one text cell per requested attribute
    row = []
    for name in functions:
        try:
            value = getattr(parsed_tweet, name)
            if running_python3:
                row.append(str(value))
            elif isinstance(value, (str, unicode)):
                # Python 2: emit text values as UTF-8 encoded bytes
                row.append(value.encode("utf-8"))
            else:
                row.append(str(value))
        except NotAvailableError as nae:
            # unavailable attributes still get a placeholder so columns stay aligned
            if not options.pass_not_available:
                sys.stderr.write("{}. Use the flag -a to pass silently next time.\nAttribute Unavailable: {}".format(nae, line))
            row.append("NOT_AVAILABLE")

    sys.stdout.write(options.delim.join(row) + "\n")
93 | 


--------------------------------------------------------------------------------
/tweet_parser/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xdevplatform/tweet_parser/3435de8367d36b483a6cfd8d46cc28694ee8a42e/tweet_parser/__init__.py


--------------------------------------------------------------------------------
/tweet_parser/deprecator.py:
--------------------------------------------------------------------------------
 1 | # https://stackoverflow.com/questions/2536307/decorators-in-the-python-standard-lib-deprecated-specifically
 2 | 
 3 | import functools
 4 | import inspect
 5 | import warnings
 6 | 
 7 | string_types = (type(b''), type(u''))
 8 | 
 9 | class FieldDeprecationWarning(Warning):
10 |     pass
11 | 
12 | def deprecated(reason):
13 |     """
14 |     This is a decorator which can be used to mark functions
15 |     as deprecated. It will result in a warning being emitted
16 |     when the function is used.
17 |     """
18 | 
19 |     if isinstance(reason, string_types):
20 | 
21 |         # The @deprecated is used with a 'reason'.
22 |         #
23 |         # .. code-block:: python
24 |         #
25 |         #    @deprecated("please, use another function")
26 |         #    def old_function(x, y):
27 |         #      pass
28 | 
29 |         def decorator(func1):
30 | 
31 |             if inspect.isclass(func1):
32 |                 fmt1 = "Call to deprecated class {name} ({reason})."
33 |             else:
34 |                 fmt1 = "Call to deprecated function {name} ({reason})."
35 | 
36 |             @functools.wraps(func1)
37 |             def new_func1(*args, **kwargs):
38 |                 #warnings.simplefilter('default', DeprecationWarning)
39 |                 warnings.warn(
40 |                     fmt1.format(name=func1.__name__, reason=reason),
41 |                     category=FieldDeprecationWarning,
42 |                     stacklevel=2
43 |                 )
44 |                 #warnings.simplefilter('default', DeprecationWarning)
45 |                 return func1(*args, **kwargs)
46 | 
47 |             return new_func1
48 | 
49 |         return decorator
50 | 
51 |     elif inspect.isclass(reason) or inspect.isfunction(reason):
52 | 
53 |         # The @deprecated is used without any 'reason'.
54 |         #
55 |         # .. code-block:: python
56 |         #
57 |         #    @deprecated
58 |         #    def old_function(x, y):
59 |         #      pass
60 | 
61 |         func2 = reason
62 | 
63 |         if inspect.isclass(func2):
64 |             fmt2 = "Call to deprecated class {name}."
65 |         else:
66 |             fmt2 = "Call to deprecated function {name}."
67 | 
68 |         @functools.wraps(func2)
69 |         def new_func2(*args, **kwargs):
70 |             #warnings.simplefilter('default', DeprecationWarning)
71 |             warnings.warn(
72 |                 fmt2.format(name=func2.__name__),
73 |                 category=FieldDeprecationWarning,
74 |                 stacklevel=2
75 |             )
76 |             #warnings.simplefilter('default', DeprecationWarning)
77 |             return func2(*args, **kwargs)
78 | 
79 |         return new_func2
80 | 
81 |     else:
82 |         raise TypeError(repr(type(reason)))
83 | 


--------------------------------------------------------------------------------
/tweet_parser/getter_methods/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xdevplatform/tweet_parser/3435de8367d36b483a6cfd8d46cc28694ee8a42e/tweet_parser/getter_methods/__init__.py


--------------------------------------------------------------------------------
/tweet_parser/getter_methods/gnip_fields.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2018 Twitter, Inc.
 2 | # Licensed under the MIT License
 3 | # https://opensource.org/licenses/MIT
 4 | from tweet_parser.tweet_checking import is_original_format
 5 | 
 6 | 
 7 | def get_matching_rules(tweet):
 8 |     """
 9 |     Retrieves the matching rules for a tweet with a gnip field enrichment.
10 | 
11 |     Args:
12 |         tweet (Tweet): the tweet
13 | 
14 |     Returns:
15 |         list: potential ``[{"tag": "user_tag", "value": "rule_value"}]`` 
16 |         pairs from standard rulesets or None if no rules or no
17 |         matching_rules field is found. \n
18 |         More information on this value at:
19 |         http://support.gnip.com/enrichments/matching_rules.html
20 | 
21 |     """
22 |     if is_original_format(tweet):
23 |         rules = tweet.get("matching_rules")
24 |     else:
25 |         gnip = tweet.get("gnip")
26 |         rules = gnip.get("matching_rules") if gnip else None
27 |     return rules
28 | 


--------------------------------------------------------------------------------
/tweet_parser/getter_methods/tweet_counts.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # Copyright 2018 Twitter, Inc.
 3 | # Licensed under the MIT License
 4 | # https://opensource.org/licenses/MIT
 5 | """Tweet counts and related attributes
 6 | 
 7 | This module holds attributes related to basic counts on tweets, such as
 8 | retweets, favs, and quotes. It is unlikely to be extended.
 9 | """
10 | 
11 | from tweet_parser.tweet_checking import is_original_format
12 | from tweet_parser.tweet_parser_errors import NotAvailableError
13 | 
def get_retweet_count(tweet):
    """
    Return how many times this tweet has been retweeted.

    Args:
        tweet (Tweet): A Tweet object (or a dictionary)

    Returns:
        int: The number of times the Tweet has been retweeted
        (0 when the count field is missing)

    Example:
        >>> from tweet_parser.getter_methods.tweet_counts import get_retweet_count
        >>> tweet = {'created_at': '2017-21-23T15:21:21.000Z',
        ...          'id_str': '2382763597',
        ...          'retweet_count': 2}
        >>> get_retweet_count(tweet)
        2

        >>> activity_streams_tweet = {'postedTime': '2017-05-24T20:17:19.000Z',
        ...                           'retweetCount': 3}
        >>> get_retweet_count(activity_streams_tweet)
        3
    """
    # the two payload formats store the count under different keys
    count_key = "retweet_count" if is_original_format(tweet) else "retweetCount"
    return tweet.get(count_key, 0)
41 | 
42 | 
def get_favorite_count(tweet):
    """
    Return how many times this tweet has been favorited.

    Args:
        tweet (Tweet): A Tweet object (or a dictionary)

    Returns:
        int: The number of times the Tweet has been favorited
        (0 when the count field is missing)

    Example:
        >>> from tweet_parser.getter_methods.tweet_counts import get_favorite_count
        >>> tweet = {'created_at': '2017-21-23T15:21:21.000Z',
        ...          'id_str': '2382763597',
        ...          'favorite_count': 2}
        >>> get_favorite_count(tweet)
        2

        >>> activity_streams_tweet = {'postedTime': '2017-05-24T20:17:19.000Z',
        ...                           'favoritesCount': 3}
        >>> get_favorite_count(activity_streams_tweet)
        3
    """
    # the two payload formats store the count under different keys
    count_key = "favorite_count" if is_original_format(tweet) else "favoritesCount"
    return tweet.get(count_key, 0)
70 | 
71 | 
def get_quote_count(tweet):
    """
    Return how many times this tweet has been quoted. \n
    Quote counts do not exist in activity-streams format.

    Args:
        tweet (Tweet): A Tweet object (or a dictionary)

    Returns:
        int: The number of times the Tweet has been quoted
        (0 when the count field is missing)

    Raises:
        NotAvailableError: if the tweet is in activity-streams format

    Example:
        >>> from tweet_parser.getter_methods.tweet_counts import get_quote_count
        >>> tweet = {'created_at': '2017-21-23T15:21:21.000Z',
        ...          'id_str': '2382763597',
        ...          'quote_count': 2}
        >>> get_quote_count(tweet)
        2
    """
    # guard clause: only original-format payloads carry a quote count
    if not is_original_format(tweet):
        raise NotAvailableError("Quote counts are only available in original format")
    return tweet.get("quote_count", 0)
96 | 


--------------------------------------------------------------------------------
/tweet_parser/getter_methods/tweet_date.py:
--------------------------------------------------------------------------------
 1 | # Twitter Snowflake ID to timestamp (and back)
 2 | # https://github.com/client9/snowflake2time/
 3 | # Nick Galbreath @ngalbreath nickg@client9.com
 4 | # Public Domain -- No Copyright -- Cut-n-Paste
 5 | 
 6 | 
 7 | def snowflake2utc(sf):
 8 |     """
 9 |     Convert a Twitter snowflake ID to a Unix timestamp
10 |     (seconds since Jan 1 1970 00:00:00)
11 | 
12 |     Args:
13 |         sf (str): Twitter snowflake ID as a string
14 | 
15 |     Returns:
16 |         int: seconds since Jan 1 1970 00:00:00
17 |     """
18 |     sf_int = int(sf)
19 |     return int(((sf_int >> 22) + 1288834974657) / 1000.0)
20 | 


--------------------------------------------------------------------------------
/tweet_parser/getter_methods/tweet_embeds.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2018 Twitter, Inc.
 2 | # Licensed under the MIT License
 3 | # https://opensource.org/licenses/MIT
 4 | from tweet_parser.tweet_checking import is_original_format
 5 | from tweet_parser.getter_methods.tweet_text import get_tweet_type
 6 | 
 7 | 
 8 | def get_quoted_tweet(tweet):
 9 |     """
10 |     Get the quoted Tweet and return it as a dictionary
11 |     If the Tweet is not a quote Tweet, return None
12 | 
13 |     Args:
14 |         tweet (Tweet or dict): A Tweet object or a dictionary
15 | 
16 |     Returns:
17 |         dict: A dictionary representing the quoted status
18 |         or None if there is no quoted status. \n
19 |         - For original format, this is the value of "quoted_status" \n
20 |         - For activity streams, this is the value of "twitter_quoted_status"
21 |     """
22 |     if get_tweet_type(tweet) == "quote":
23 |         if is_original_format(tweet):
24 |             return tweet["quoted_status"]
25 |         else:
26 |             return tweet["twitter_quoted_status"]
27 | 
28 |     else:
29 |         return None
30 | 
31 | 
def get_retweeted_tweet(tweet):
    """
    Return the retweeted Tweet as a dictionary, or None when the Tweet
    is not a Retweet.

    Args:
        tweet (Tweet or dict): A Tweet object or a dictionary

    Returns:
        dict: A dictionary representing the retweeted status
        or None if there is no retweeted status. \n
        - For original format, this is the value of "retweeted_status" \n
        - For activity streams, If the Tweet is a Retweet this is the value of the key "object"
    """
    # anything that is not a Retweet has no retweeted status to return
    if get_tweet_type(tweet) != "retweet":
        return None
    # the retweeted status is stored under a format-specific key
    status_key = "retweeted_status" if is_original_format(tweet) else "object"
    return tweet[status_key]
53 | 
54 | 
def get_embedded_tweet(tweet):
    """
    Return the Tweet embedded in this one: the retweeted Tweet if there is
    one, otherwise the quoted Tweet.

    Args:
        tweet (Tweet): A Tweet object (not simply a dict)

    Returns:
        dict (or None, if the Tweet is neither a quote tweet or a Retweet):
        a dictionary representing the quote Tweet or the Retweet
    """
    # the retweeted Tweet takes precedence over the quoted Tweet
    for attribute in ("retweeted_tweet", "quoted_tweet"):
        if getattr(tweet, attribute) is not None:
            return getattr(tweet, attribute)
    return None
72 | 


--------------------------------------------------------------------------------
/tweet_parser/getter_methods/tweet_entities.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2018 Twitter, Inc.
  2 | # Licensed under the MIT License
  3 | # https://opensource.org/licenses/MIT
  4 | from tweet_parser.tweet_checking import is_original_format
  5 | from tweet_parser.getter_methods.tweet_embeds import get_retweeted_tweet
  6 | from tweet_parser.getter_methods.tweet_text import get_tweet_type
  7 | 
  8 | def get_entities(tweet):
  9 |     """
 10 |     Helper function to simply grabbing the entities. \n
 11 |     Caveat: In the case of Retweets, a Retweet is stored as
 12 |     "RT @someone: Some awesome status". In the case where pre-appending
 13 |     the string "RT @someone:" causes the Tweet to exceed 140 characters,
 14 |     entites (hashtags, mentions, urls) beyond the 140 character mark are
 15 |     excluded from the Retweet's entities. This seems like counterintuitive
 16 |     behavior, so we ensure here that the entities of a Retweet are a
 17 |     superset of the entities of the Retweeted status.
 18 | 
 19 |     Args:
 20 |         tweet (Tweet or dict): Tweet in question
 21 | 
 22 |     Returns:
 23 |         dict: dictionary of potential entities.
 24 | 
 25 |     Example:
 26 |         >>> from tweet_parser.getter_methods.tweet_entities import get_entities
 27 |         >>> original = {"created_at": "Wed May 24 20:17:19 +0000 2017",
 28 |         ...             "entities": {"user_mentions": [{
 29 |         ...                              "indices": [14,26], #characters where the @ mention appears
 30 |         ...                              "id_str": "2382763597", #id of @ mentioned user as a string
 31 |         ...                              "screen_name": "notFromShrek", #screen_name of @ mentioned user
 32 |         ...                              "name": "Fiona", #display name of @ mentioned user
 33 |         ...                              "id": 2382763597 #id of @ mentioned user as an int
 34 |         ...                            }]
 35 |         ...                          }
 36 |         ...             }
 37 |         >>> get_entities(original)
 38 |         {'user_mentions': [{'indices': [14, 26], 'id_str': '2382763597', 'screen_name': 'notFromShrek', 'name': 'Fiona', 'id': 2382763597}]}
 39 |         """
 40 | 
 41 |     entity_key = "entities" if is_original_format(tweet) else "twitter_entities"
 42 |     if get_tweet_type(tweet) == "retweet":
 43 |         retweet_entities = tweet.get(entity_key, [])
 44 |         all_entities = get_retweeted_tweet(tweet).get(entity_key,[]).copy()
 45 |         # the only thing that the Retweet will have that the Retweeted Tweet
 46 |         # won't have is the @-mention of the RTd user at the front ("RT @someone:")
 47 |         # I'm going to add that in, so the the Retweet's entities are a superset
 48 |         # of the RTd Tweet's entites
 49 |         all_entities["user_mentions"] = ([retweet_entities["user_mentions"][0]] +
 50 |             all_entities["user_mentions"])
 51 |         return all_entities
 52 |     else:
 53 |         return tweet.get(entity_key, [])
 54 | 
 55 | 
 56 | def get_media_entities(tweet):
 57 |     """
 58 |     Grabs all the media entities from a tweet, which are contained in the
 59 |     "extended_entities" or "twitter_extended_entities" field depending on the
 60 |     tweet format. Note that this is not the same as the first media entity from
 61 |     the basic `entities` key; this is required to get *all* of the potential
 62 |     media contained within a tweet. This is useful as an entry point for other
 63 |     functions or for any custom parsing that needs to be done.
 64 | 
 65 |     Args:
 66 |         tweet (Tweet or dict): the tweet in question
 67 | 
 68 |     Returns:
 69 |         list or None: the list of dicts containing each media's metadata in the
 70 |         tweet.
 71 | 
 72 |     Example:
 73 |         >>> from tweet_parser.getter_methods.tweet_entities import get_media_entities
 74 |         >>> tweet = {'created_at': '2017-21-23T15:21:21.000Z',
 75 |         ...          'entities': {'user_mentions': [{'id': 2382763597,
 76 |         ...          'id_str': '2382763597',
 77 |         ...          'indices': [14, 26],
 78 |         ...          'name': 'Fiona',
 79 |         ...          'screen_name': 'notFromShrek'}]},
 80 |         ...          'extended_entities': {'media': [{'display_url': 'pic.twitter.com/something',
 81 |         ...          'expanded_url': 'https://twitter.com/something',
 82 |         ...          'id': 4242,
 83 |         ...          'id_str': '4242',
 84 |         ...          'indices': [88, 111],
 85 |         ...          'media_url': 'http://pbs.twimg.com/media/something.jpg',
 86 |         ...          'media_url_https': 'https://pbs.twimg.com/media/something.jpg',
 87 |         ...          'sizes': {'large': {'h': 1065, 'resize': 'fit', 'w': 1600},
 88 |         ...          'medium': {'h': 799, 'resize': 'fit', 'w': 1200},
 89 |         ...          'small': {'h': 453, 'resize': 'fit', 'w': 680},
 90 |         ...          'thumb': {'h': 150, 'resize': 'crop', 'w': 150}},
 91 |         ...          'type': 'photo',
 92 |         ...          'url': 'https://t.co/something'},
 93 |         ...          {'display_url': 'pic.twitter.com/something_else',
 94 |         ...          'expanded_url': 'https://twitter.com/user/status/something/photo/1',
 95 |         ...          'id': 4243,
 96 |         ...          'id_str': '4243',
 97 |         ...          'indices': [88, 111],
 98 |         ...          'media_url': 'http://pbs.twimg.com/media/something_else.jpg',
 99 |         ...          'media_url_https': 'https://pbs.twimg.com/media/something_else.jpg',
100 |         ...          'sizes': {'large': {'h': 1065, 'resize': 'fit', 'w': 1600},
101 |         ...          'medium': {'h': 799, 'resize': 'fit', 'w': 1200},
102 |         ...          'small': {'h': 453, 'resize': 'fit', 'w': 680},
103 |         ...          'thumb': {'h': 150, 'resize': 'crop', 'w': 150}},
104 |         ...          'type': 'photo',
105 |         ...          'url': 'https://t.co/something_else'}]}
106 |         ...         }
107 |         >>> get_media_entities(tweet)
108 |         [{'display_url': 'pic.twitter.com/something', 'expanded_url': 'https://twitter.com/something', 'id': 4242, 'id_str': '4242', 'indices': [88, 111], 'media_url': 'http://pbs.twimg.com/media/something.jpg', 'media_url_https': 'https://pbs.twimg.com/media/something.jpg', 'sizes': {'large': {'h': 1065, 'resize': 'fit', 'w': 1600}, 'medium': {'h': 799, 'resize': 'fit', 'w': 1200}, 'small': {'h': 453, 'resize': 'fit', 'w': 680}, 'thumb': {'h': 150, 'resize': 'crop', 'w': 150}}, 'type': 'photo', 'url': 'https://t.co/something'}, {'display_url': 'pic.twitter.com/something_else', 'expanded_url': 'https://twitter.com/user/status/something/photo/1', 'id': 4243, 'id_str': '4243', 'indices': [88, 111], 'media_url': 'http://pbs.twimg.com/media/something_else.jpg', 'media_url_https': 'https://pbs.twimg.com/media/something_else.jpg', 'sizes': {'large': {'h': 1065, 'resize': 'fit', 'w': 1600}, 'medium': {'h': 799, 'resize': 'fit', 'w': 1200}, 'small': {'h': 453, 'resize': 'fit', 'w': 680}, 'thumb': {'h': 150, 'resize': 'crop', 'w': 150}}, 'type': 'photo', 'url': 'https://t.co/something_else'}]
109 |     """
110 | 
111 |     ext_ents_key = "extended_entities" if is_original_format(tweet) else "twitter_extended_entities"
112 |     ext_ents = tweet.get(ext_ents_key)
113 |     media = ext_ents.get("media", []) if ext_ents else []
114 |     return media
115 | 
116 | 
def get_media_urls(tweet):
    """
    Collect the https link ("media_url_https") of every media entity in the
    tweet.

    Args:
        tweet (Tweet or dict): tweet

    Returns:
        list: list of urls, one per media entity (in payload order). Will be
        an empty list if there are no media entities present.

    Example:
        >>> from tweet_parser.getter_methods.tweet_entities import get_media_urls
        >>> tweet = {'created_at': '2017-21-23T15:21:21.000Z',
        ...          'entities': {'user_mentions': [{'id': 2382763597,
        ...          'id_str': '2382763597',
        ...          'indices': [14, 26],
        ...          'name': 'Fiona',
        ...          'screen_name': 'notFromShrek'}]},
        ...          'extended_entities': {'media': [{'display_url': 'pic.twitter.com/something',
        ...          'expanded_url': 'https://twitter.com/something',
        ...          'id': 4242,
        ...          'id_str': '4242',
        ...          'indices': [88, 111],
        ...          'media_url': 'http://pbs.twimg.com/media/something.jpg',
        ...          'media_url_https': 'https://pbs.twimg.com/media/something.jpg',
        ...          'sizes': {'large': {'h': 1065, 'resize': 'fit', 'w': 1600},
        ...          'medium': {'h': 799, 'resize': 'fit', 'w': 1200},
        ...          'small': {'h': 453, 'resize': 'fit', 'w': 680},
        ...          'thumb': {'h': 150, 'resize': 'crop', 'w': 150}},
        ...          'type': 'photo',
        ...          'url': 'https://t.co/something'},
        ...          {'display_url': 'pic.twitter.com/something_else',
        ...          'expanded_url': 'https://twitter.com/user/status/something/photo/1',
        ...          'id': 4243,
        ...          'id_str': '4243',
        ...          'indices': [88, 111],
        ...          'media_url': 'http://pbs.twimg.com/media/something_else.jpg',
        ...          'media_url_https': 'https://pbs.twimg.com/media/something_else.jpg',
        ...          'sizes': {'large': {'h': 1065, 'resize': 'fit', 'w': 1600},
        ...          'medium': {'h': 799, 'resize': 'fit', 'w': 1200},
        ...          'small': {'h': 453, 'resize': 'fit', 'w': 680},
        ...          'thumb': {'h': 150, 'resize': 'crop', 'w': 150}},
        ...          'type': 'photo',
        ...          'url': 'https://t.co/something_else'}]}
        ...         }
        >>> get_media_urls(tweet)
        ['https://pbs.twimg.com/media/something.jpg', 'https://pbs.twimg.com/media/something_else.jpg']
    """

    https_links = []
    # "or ()" makes a falsy result (None / empty) iterate zero times.
    for media_item in get_media_entities(tweet) or ():
        https_links.append(media_item.get("media_url_https"))
    return https_links
169 | 
170 | 
171 | 
def get_user_mentions(tweet):
    """
    Return the @-mentions found in the Tweet, each as a dictionary.
    For a quote-tweet, users mentioned in the quoted status are not
    included; to get those, call get_user_mentions on the quoted status.
    Also note that in the case of a quote-tweet, the list of @-mentioned
    users does not include the user who authored the original (quoted) Tweet.

    Args:
        tweet (Tweet or dict): A Tweet object or dictionary

    Returns:
        list (list of dicts): 1 item per @ mention. Note that the fields here
        aren't enforced by the parser, they are simply the fields as they
        appear in a Tweet data payload. An empty list when there are no
        mentions.

    Example:
        >>> from tweet_parser.getter_methods.tweet_entities import get_user_mentions
        >>> original = {"created_at": "Wed May 24 20:17:19 +0000 2017",
        ...             "text": "RT @notFromShrek: Stuff! Words! ...",
        ...             "entities": {"user_mentions": [{
        ...                              "indices": [2,12], #characters where the @ mention appears
        ...                              "id_str": "2382763597", #id of @ mentioned user as a string
        ...                              "screen_name": "notFromShrek", #screen_name of @d user
        ...                              "name": "Fiona", #display name of @ mentioned user
        ...                              "id": 2382763597 #id of @ mentioned user as an int
        ...                            }]
        ...                          },
        ...             "retweeted_status": {
        ...                 "created_at": "Wed May 24 20:01:19 +0000 2017",
        ...                 "text": "Stuff! Words! #Tweeting!",
        ...                 "entities": {"user_mentions": []}
        ...                 }
        ...             }
        >>> get_user_mentions(original)
        [{'indices': [2, 12], 'id_str': '2382763597', 'screen_name': 'notFromShrek', 'name': 'Fiona', 'id': 2382763597}]
    """
    entities = get_entities(tweet)
    if not entities:
        # no entities at all -> nothing was mentioned
        return []
    mentions = entities.get("user_mentions")
    if not mentions:
        return []
    return mentions
213 | 
214 | 
def get_hashtags(tweet):
    """
    Get a list of hashtags in the Tweet
    Note that in the case of a quote-tweet, this does not return the
    hashtags in the quoted status.

    Args:
        tweet (Tweet or dict): A Tweet object or dictionary

    Returns:
        list (a list of strings): list of all of the hashtags in the Tweet;
        an empty list when the Tweet has no entities or no hashtags

    Example:
        >>> from tweet_parser.getter_methods.tweet_entities import get_hashtags
        >>> original = {"created_at": "Wed May 24 20:17:19 +0000 2017",
        ...            "entities": {"hashtags": [{"text":"1hashtag"}]}}
        >>> get_hashtags(original)
        ['1hashtag']

        >>> activity = {"postedTime": "2017-05-24T20:17:19.000Z",
        ...             "verb": "post",
        ...             "twitter_entities": {"hashtags": [
        ...                     {"text":"1hashtag"},
        ...                     {"text": "moreHashtags"}]}}
        >>> get_hashtags(activity)
        ['1hashtag', 'moreHashtags']
    """
    entities = get_entities(tweet)
    # get_entities can hand back an empty, non-dict placeholder when the
    # payload has no entities; guard before calling .get (same pattern as
    # get_user_mentions) so such Tweets yield [] instead of raising.
    hashtags = entities.get("hashtags") if entities else None
    return [tag["text"] for tag in hashtags] if hashtags else []
246 | 


--------------------------------------------------------------------------------
/tweet_parser/getter_methods/tweet_generator.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2018 Twitter, Inc.
 2 | # Licensed under the MIT License
 3 | # https://opensource.org/licenses/MIT
 4 | from tweet_parser.tweet_checking import is_original_format
 5 | import sys
 6 | if sys.version_info[0] == 3:
 7 |     from html.parser import HTMLParser
 8 | elif sys.version_info[0] == 2:
 9 |     from HTMLParser import HTMLParser
10 | 
class GeneratorHTMLParser(HTMLParser):
    """
    HTML parser class to handle HTML tags in the original format source field

    After ``feed()``, ``generator_link`` holds the value of the last ``href``
    attribute seen and ``generator_name`` holds the last chunk of text data
    seen. Both stay ``None`` when the fed markup contains no such content
    (for example an empty source string).
    """
    # Class-level defaults: the callbacks below only run when the markup
    # actually contains a tag / text, so without these the attributes would
    # not exist at all and readers would hit AttributeError.
    generator_link = None
    generator_name = None

    def handle_starttag(self, tag, attrs):
        # attrs is a list of (name, value) pairs; remember the href target
        for attr in attrs:
            if attr[0] == "href":
                self.generator_link = attr[1]

    def handle_data(self, data):
        # text between tags is taken as the generator's display name
        self.generator_name = data
22 | 
23 | 
def get_generator(tweet):
    """
    Describe the application that was used to create the Tweet.

    Args:
        tweet (Tweet): A Tweet object (or a dictionary)

    Returns:
        dict: keys are 'link' and 'name', the web link and the name
        of the application

    Example:
        >>> from tweet_parser.getter_methods.tweet_generator import get_generator
        >>> original_format_dict = {
        ...             "created_at": "Wed May 24 20:17:19 +0000 2017",
        ...             "source": '<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>'
        ...            }
        >>> get_generator(original_format_dict)
        {'link': 'http://twitter.com', 'name': 'Twitter Web Client'}

        >>> activity_streams_format_dict = {
        ...             "postedTime": "2017-05-24T20:17:19.000Z",
        ...             "generator":
        ...              {"link": "http://twitter.com",
        ...               "displayName": "Twitter Web Client"}
        ...             }
        >>> get_generator(activity_streams_format_dict)
        {'link': 'http://twitter.com', 'name': 'Twitter Web Client'}
    """
    if not is_original_format(tweet):
        # activity-streams already carries the two fields separately
        generator = tweet["generator"]
        return {"link": generator["link"],
                "name": generator["displayName"]}

    # Original format: "source" is an HTML anchor that has to be parsed.
    # convert_charrefs is only passed on Python 3.4+.
    major, minor = sys.version_info[0], sys.version_info[1]
    parser_options = {}
    if major == 3 and minor >= 4:
        parser_options["convert_charrefs"] = True
    html_parser = GeneratorHTMLParser(**parser_options)
    html_parser.feed(tweet["source"])
    return {"link": html_parser.generator_link,
            "name": html_parser.generator_name}
64 | 


--------------------------------------------------------------------------------
/tweet_parser/getter_methods/tweet_geo.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2018 Twitter, Inc.
 2 | # Licensed under the MIT License
 3 | # https://opensource.org/licenses/MIT
 4 | from tweet_parser.tweet_checking import is_original_format
 5 | 
 6 | 
 7 | def get_geo_coordinates(tweet):
 8 |     """
 9 |     Get the user's geo coordinates, if they are included in the payload
10 |     (otherwise return None)
11 | 
12 |     Args:
13 |         tweet (Tweet or dict): A Tweet object or dictionary
14 | 
15 |     Returns:
16 |         dict: dictionary with the keys "latitude" and "longitude"
17 |               or, if unavaiable, None
18 | 
19 |     Example:
20 |         >>> from tweet_parser.getter_methods.tweet_geo import get_geo_coordinates
21 |         >>> tweet_geo = {"geo": {"coordinates": [1,-1]}}
22 |         >>> get_geo_coordinates(tweet_geo)
23 |         {'latitude': 1, 'longitude': -1}
24 | 
25 |         >>> tweet_no_geo = {"geo": {}}
26 |         >>> get_geo_coordinates(tweet_no_geo) #returns None
27 |     """
28 |     if "geo" in tweet:
29 |         if tweet["geo"] is not None:
30 |             if "coordinates" in tweet["geo"]:
31 |                 [lat, lon] = tweet["geo"]["coordinates"]
32 |                 return {"latitude": lat, "longitude": lon}
33 |     return None
34 | 
35 | 
def get_profile_location(tweet):
    """
    Get user's derived location data from the profile location enrichment
    If unavailable, returns None.

    Args:
        tweet (Tweet or dict): Tweet object or dictionary

    Returns:
        dict: more information on the profile locations enrichment here:
        http://support.gnip.com/enrichments/profile_geo.html \n
        For activity-streams payloads the dict is rebuilt with the
        original-format key names ("country", "country_code", "locality",
        "region", "sub_region", "full_name", "geo"); only fields present
        (and not null) in the payload are included. None is returned when
        the enrichment is missing or its list of locations is empty.

    Example:
        >>> result = {"country": "US",         # Two letter ISO-3166 country code
        ...           "locality": "Boulder",   # The locality location (~ city)
        ...           "region": "Colorado",    # The region location (~ state/province)
        ...           "sub_region": "Boulder", # The sub-region location (~ county)
        ...           "full_name": "Boulder, Colorado, US", # The full name (excluding sub-region)
        ...           "geo":  [40,-105]        # lat/long value that coordinate that corresponds to
        ...                            # the lowest granularity location for where the user
        ...                            # who created the Tweet is from
        ...  }

    Caveats:
        This only returns the first element of the 'locations' list.
        I'm honestly not sure what circumstances would result in a list that
        is more than one element long.
    """
    # KeyError: enrichment key absent; IndexError: empty locations list;
    # TypeError: a container in the path is null. All mean "unavailable".
    unavailable = (KeyError, IndexError, TypeError)

    if is_original_format(tweet):
        try:
            return tweet["user"]["derived"]["locations"][0]
        except unavailable:
            return None

    # activity-streams key -> original-format key, in output order
    address_fields = (("country", "country"),
                      ("countryCode", "country_code"),
                      ("locality", "locality"),
                      ("region", "region"),
                      ("subRegion", "sub_region"))
    location_fields = (("displayName", "full_name"),
                       ("geo", "geo"))
    try:
        location = tweet["gnip"]["profileLocations"][0]
        address = location["address"]
        reconstructed_original_format = {}
        for source_key, target_key in address_fields:
            if address.get(source_key, None) is not None:
                reconstructed_original_format[target_key] = address[source_key]
        for source_key, target_key in location_fields:
            if location.get(source_key, None) is not None:
                reconstructed_original_format[target_key] = location[source_key]
        return reconstructed_original_format
    except unavailable:
        return None
90 | 


--------------------------------------------------------------------------------
/tweet_parser/getter_methods/tweet_links.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2018 Twitter, Inc.
  2 | # Licensed under the MIT License
  3 | # https://opensource.org/licenses/MIT
  4 | from tweet_parser.tweet_checking import is_original_format
  5 | 
  6 | 
  7 | def get_tweet_links(tweet):
  8 |     """
  9 |     Get the links that are included in the Tweet as "urls"
 10 |     (if there are no links in the Tweet, this returns an empty list)
 11 |     This includes links that are included in quoted or retweeted Tweets
 12 |     Returns unrolled or expanded_url information if it is available
 13 | 
 14 |     Args:
 15 |         tweet (Tweet): A Tweet object (must be a Tweet obj, not a dict)
 16 | 
 17 |     Returns:
 18 |         list (list of dicts): A list of dictionaries containing information
 19 |         about urls. Each dictionary entity can have these keys; without
 20 |         unwound url or expanded url Twitter data enrichments many of these
 21 |         fields will be missing. \n
 22 |         More information about the Twitter url enrichments at:
 23 |         http://support.gnip.com/enrichments/expanded_urls.html and
 24 |         http://support.gnip.com/enrichments/enhanced_urls.html
 25 | 
 26 |     Example:
 27 |         >>> result = [
 28 |         ...   {
 29 |         ...   # url that shows up in the tweet text
 30 |         ...   'display_url': "https://twitter.com/RobotPrinc...",
 31 |         ...   # long (expanded) url
 32 |         ...   'expanded_url': "https://twitter.com/RobotPrincessFi",
 33 |         ...   # characters where the display link is
 34 |         ...   'indices': [55, 88],
 35 |         ...   'unwound': {
 36 |         ...      # description from the linked webpage
 37 |         ...      'description': "the Twitter profile of RobotPrincessFi",
 38 |         ...      'status': 200,
 39 |         ...      # title of the webpage
 40 |         ...      'title': "the Twitter profile of RobotPrincessFi",
 41 |         ...      # long (expanded) url}
 42 |         ...      'url': "https://twitter.com/RobotPrincessFi"},
 43 |         ...   # the url that tweet directs to, often t.co
 44 |         ...   'url': "t.co/1234"}]
 45 |     """
 46 |     if is_original_format(tweet):
 47 |         # get the urls from the Tweet
 48 |         try:
 49 |             tweet_urls = tweet["entities"]["urls"]
 50 |         except KeyError:
 51 |             tweet_urls = []
 52 |         # get the urls from the quote-tweet
 53 |         if tweet.quoted_tweet is not None:
 54 |             tweet_urls += tweet.quoted_tweet.tweet_links
 55 |         # get the urls from the retweet
 56 |         if tweet.retweeted_tweet is not None:
 57 |             tweet_urls += tweet.retweeted_tweet.tweet_links
 58 |         return tweet_urls
 59 |     else:
 60 |         # try to get normal urls
 61 |         try:
 62 |             tweet_urls = tweet["twitter_entities"]["urls"]
 63 |         except KeyError:
 64 |             tweet_urls = []
 65 |         # get the urls from the quote-tweet
 66 |         if tweet.quoted_tweet is not None:
 67 |             tweet_urls += tweet.quoted_tweet.tweet_links
 68 |         # get the urls from the retweet
 69 |         if tweet.retweeted_tweet is not None:
 70 |             tweet_urls += tweet.retweeted_tweet.tweet_links
 71 |         # otherwise, we're now going to combine the urls to try to
 72 |         # to get the same format as the og format urls, try to get enriched urls
 73 |         try:
 74 |             gnip_tweet_urls = {x["url"]: x for x in tweet["gnip"]["urls"]}
 75 |             gnip_tweet_exp_urls = {x["expanded_url"]: x for x in tweet["gnip"]["urls"]}
 76 |         except KeyError:
 77 |             return tweet_urls
 78 |         key_mappings = {"expanded_url": "url",
 79 |                         "expanded_status": "status",
 80 |                         "expanded_url_title": "title",
 81 |                         "expanded_url_description": "description"}
 82 |         tweet_urls_expanded = []
 83 |         for url in tweet_urls:
 84 |             expanded_url = url
 85 |             if url["url"] in gnip_tweet_urls:
 86 |                 expanded_url["unwound"] = {key_mappings[key]: value for key, value in gnip_tweet_urls[url["url"]].items() if key != "url"}
 87 |             elif url.get("expanded_url", "UNAVAILABLE") in gnip_tweet_exp_urls:
 88 |                 expanded_url["unwound"] = {key_mappings[key]: value for key, value in gnip_tweet_urls[url["expanded_url"]].items() if key != "url"}
 89 |             tweet_urls_expanded.append(expanded_url)
 90 |         return tweet_urls_expanded
 91 | 
 92 | 
 93 | def get_most_unrolled_urls(tweet):
 94 |     """
 95 |     For each url included in the Tweet "urls", get the most unrolled
 96 |     version available. Only return 1 url string per url in tweet.tweet_links
 97 |     In order of preference for "most unrolled"
 98 |     (keys from the dict at tweet.tweet_links): \n
 99 |     1. `unwound`/`url` \n
100 |     2. `expanded_url` \n
101 |     3. `url`
102 | 
103 |     Args:
104 |         tweet (Tweet): A Tweet object or dict
105 | 
106 |     Returns:
107 |         list (list of strings): a list of the most unrolled url available
108 |     """
109 |     unrolled_urls = []
110 |     for url in get_tweet_links(tweet):
111 |         if url.get("unwound", {"url": None}).get("url", None) is not None:
112 |             unrolled_urls.append(url["unwound"]["url"])
113 |         elif url.get("expanded_url", None) is not None:
114 |             unrolled_urls.append(url["expanded_url"])
115 |         else:
116 |             unrolled_urls.append(url["url"])
117 |     return unrolled_urls
118 | 


--------------------------------------------------------------------------------
/tweet_parser/getter_methods/tweet_reply.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2018 Twitter, Inc.
  2 | # Licensed under the MIT License
  3 | # https://opensource.org/licenses/MIT
  4 | from tweet_parser.tweet_checking import is_original_format
  5 | from tweet_parser.tweet_parser_errors import NotAvailableError
  6 | 
  7 | 
  8 | def get_in_reply_to_screen_name(tweet):
  9 |     """
 10 |     Get the screen name of the user whose Tweet is being replied to, None
 11 |     if this Tweet is not a reply
 12 | 
 13 |     Args:
 14 |         tweet (Tweet): A Tweet object (or a dictionary)
 15 | 
 16 |     Returns:
 17 |         str: the screen name of the user whose Tweet is being replied to
 18 |         (None if not a reply)
 19 | 
 20 |     Example:
 21 |         >>> from tweet_parser.getter_methods.tweet_reply import *
 22 |         >>> original_format_dict = {
 23 |         ...             "created_at": "Wed May 24 20:17:19 +0000 2017",
 24 |         ...             "in_reply_to_screen_name": "notFromShrek"
 25 |         ...            }
 26 |         >>> get_in_reply_to_screen_name(original_format_dict)
 27 |         'notFromShrek'
 28 | 
 29 |         >>> activity_streams_format_dict = {
 30 |         ...         "postedTime": "2017-05-24T20:17:19.000Z",
 31 |         ...         "inReplyTo":
 32 |         ...            {"link": "http://twitter.com/notFromShrek/statuses/863566329168711681"}
 33 |         ...         }
 34 |         >>> get_in_reply_to_screen_name(activity_streams_format_dict)
 35 |         'notFromShrek'
 36 |     """
 37 | 
 38 |     if is_original_format(tweet):
 39 |         return tweet["in_reply_to_screen_name"]
 40 |     else:
 41 |         if tweet.get("inReplyTo", None) is not None:
 42 |             return tweet["inReplyTo"]["link"].split("/")[-3]
 43 |         else:
 44 |             return None
 45 | 
 46 | 
 47 | def get_in_reply_to_user_id(tweet):
 48 |     """
 49 |     Get the user id of the uesr whose Tweet is being replied to, and None
 50 |     if this Tweet is not a reply. \n
 51 |     Note that this is unavailable in activity-streams format
 52 | 
 53 |     Args:
 54 |         tweet (Tweet): A Tweet object (or a dictionary)
 55 | 
 56 |     Returns:
 57 |         str: the user id of the user whose Tweet is being replied to, None
 58 |         (if not a reply), or for activity-streams raise a NotAvailableError
 59 | 
 60 |     Example:
 61 |         >>> from tweet_parser.getter_methods.tweet_reply import *
 62 |         >>> original_format_dict = {
 63 |         ...             "created_at": "Wed May 24 20:17:19 +0000 2017",
 64 |         ...             "in_reply_to_user_id_str": "2382763597"
 65 |         ...            }
 66 |         >>> get_in_reply_to_user_id(original_format_dict)
 67 |         '2382763597'
 68 |     """
 69 | 
 70 |     if is_original_format(tweet):
 71 |         return tweet["in_reply_to_user_id_str"]
 72 |     else:
 73 |         raise NotAvailableError("Gnip activity-streams format does not" +
 74 |                                 " return the replied to user's id")
 75 | 
 76 | 
 77 | def get_in_reply_to_status_id(tweet):
 78 |     """
 79 |     Get the tweet id of the Tweet being replied to, None
 80 |     if this Tweet is not a reply
 81 | 
 82 |     Args:
 83 |         tweet (Tweet): A Tweet object (or a dictionary)
 84 | 
 85 |     Returns:
 86 |         str: the tweet id of the Tweet being replied to
 87 |         (None if not a reply)
 88 | 
 89 |     Example:
 90 |         >>> from tweet_parser.getter_methods.tweet_reply import *
 91 |         >>> original_format_dict = {
 92 |         ...             "created_at": "Wed May 24 20:17:19 +0000 2017",
 93 |         ...             "in_reply_to_status_id_str": "863566329168711681"
 94 |         ...            }
 95 |         >>> get_in_reply_to_status_id(original_format_dict)
 96 |         '863566329168711681'
 97 | 
 98 |         >>> activity_streams_format_dict = {
 99 |         ...         "postedTime": "2017-05-24T20:17:19.000Z",
100 |         ...         "inReplyTo":
101 |         ...            {"link": "http://twitter.com/notFromShrek/statuses/863566329168711681"}
102 |         ...         }
103 |         >>> get_in_reply_to_status_id(activity_streams_format_dict)
104 |         '863566329168711681'
105 |     """
106 |     if is_original_format(tweet):
107 |         return tweet["in_reply_to_status_id_str"]
108 |     else:
109 |         if tweet.get("inReplyTo", None) is not None:
110 |             return tweet["inReplyTo"]["link"].split("/")[-1]
111 |         else:
112 |             return None
113 | 


--------------------------------------------------------------------------------
/tweet_parser/getter_methods/tweet_text.py:
--------------------------------------------------------------------------------
  1 | from tweet_parser.tweet_checking import is_original_format
  2 | from tweet_parser.tweet_parser_errors import NotAvailableError
  3 | import re
  4 | 
  5 | 
  6 | def get_full_text(tweet):
  7 |     """
  8 |     Get the full text of a tweet dict.
  9 |     Includes @-mention replies and long links.
 10 | 
 11 |     Args:
 12 |         tweet (Tweet or dict): A Tweet object or dictionary
 13 | 
 14 |     Returns:
 15 |         str: the untruncated text of a Tweet
 16 |         (finds extended text if available)
 17 | 
 18 |     Example:
 19 |         >>> from tweet_parser.getter_methods.tweet_text import get_full_text
 20 |         >>> # getting the text of a Tweet that is not truncated
 21 |         >>> original_untruncated = {
 22 |         ...                 "created_at": "Wed May 24 20:17:19 +0000 2017",
 23 |         ...                 "truncated": False,
 24 |         ...                 "text": "some tweet text"
 25 |         ...                }
 26 |         >>> get_full_text(original_untruncated)
 27 |         'some tweet text'
 28 | 
 29 |         >>> activity_untruncated = {"postedTime": "2017-05-24T20:17:19.000Z",
 30 |         ...                         "body": "some tweet text"
 31 |         ...                        }
 32 |         >>> get_full_text(activity_untruncated)
 33 |         'some tweet text'
 34 | 
 35 |         >>> # getting the text of a truncated Tweet (has over 140 chars)
 36 |         >>> original_truncated = {
 37 |         ...               "created_at": "Wed May 24 20:17:19 +0000 2017",
 38 |         ...               "text": "some tweet text, lorem ip...",
 39 |         ...               "truncated": True,
 40 |         ...               "extended_tweet":
 41 |         ...                 {"full_text":
 42 |         ...                   "some tweet text, lorem ipsum dolor sit amet"}
 43 |         ...               }
 44 |         >>> get_full_text(original_truncated)
 45 |         'some tweet text, lorem ipsum dolor sit amet'
 46 | 
 47 |         >>> activity_truncated = {
 48 |         ...               "postedTime": "2017-05-24T20:17:19.000Z",
 49 |         ...               "body": "some tweet text, lorem ip...",
 50 |         ...               "long_object":
 51 |         ...                 {"body":
 52 |         ...                   "some tweet text, lorem ipsum dolor sit amet"}
 53 |         ...              }
 54 |         >>> get_full_text(activity_truncated)
 55 |         'some tweet text, lorem ipsum dolor sit amet'
 56 |     """
 57 |     if is_original_format(tweet):
 58 |         if tweet["truncated"]:
 59 |             return tweet["extended_tweet"]["full_text"]
 60 |         else:
 61 |             return tweet["text"]
 62 |     else:
 63 |         if "long_object" in tweet:
 64 |             return tweet["long_object"]["body"]
 65 |         else:
 66 |             return tweet["body"]
 67 | 
 68 | 
 69 | def get_text(tweet):
 70 |     """
 71 |     Get the contents of "text" (original format)
 72 |     or "body" (activity streams format)
 73 | 
 74 |     Args:
 75 |         tweet (Tweet or dict): A Tweet object or dictionary
 76 | 
 77 |     Returns:
 78 |         str: the contents of "text" key (original format)
 79 |         or "body" key (activity streams format)
 80 | 
 81 |     Example:
 82 |         >>> from tweet_parser.getter_methods.tweet_text import get_text
 83 |         >>> original = {
 84 |         ...             "created_at": "Wed May 24 20:17:19 +0000 2017",
 85 |         ...             "text": "some tweet text"}
 86 |         >>> get_text(original)
 87 |         'some tweet text'
 88 | 
 89 |         >>> activity = {"postedTime": "2017-05-24T20:17:19.000Z",
 90 |         ...             "body": "some tweet text"}
 91 |         >>> get_text(activity)
 92 |         'some tweet text'
 93 |     """
 94 |     if is_original_format(tweet):
 95 |         return tweet["text"]
 96 |     else:
 97 |         return tweet["body"]
 98 | 
 99 | 
def get_tweet_type(tweet):
    """
    Classify a Tweet as one of three types: tweet, quote, or retweet.

    Args:
        tweet (Tweet or dict): A Tweet object or dictionary

    Returns:
        str: (one of 3 strings)
        "tweet": an original Tweet
        "retweet": a native retweet (created with the retweet button)
        "quote": a native quote tweet (retweet button + adding quote text)

    Caveats:
        When a quote-tweet (tweet A) is quote-tweeted (tweet B),
        the innermost quoted tweet (A) in the payload (for B)
        no longer has the key "quoted_status" or "twitter_quoted_status",
        and that tweet (A) would be labeled as a "tweet" (not a "quote").
    """
    if is_original_format(tweet):
        # a retweet takes precedence over a quote when both keys are present
        if "retweeted_status" in tweet:
            return "retweet"
        if "quoted_status" in tweet:
            return "quote"
        return "tweet"

    # activity streams: retweets are marked by the "share" verb
    if tweet["verb"] == "share":
        return "retweet"
    return "quote" if "twitter_quoted_status" in tweet else "tweet"
134 | 
135 | 
def get_lang(tweet):
    """
    Get the language that the Tweet is written in.

    Args:
        tweet (Tweet or dict): A Tweet object or dictionary

    Returns:
        str: the language code found in the payload, or None if the
        language is undefined (the field is missing, is None, or holds
        the placeholder value "und")

    Example:
        >>> from tweet_parser.getter_methods.tweet_text import get_lang
        >>> original = {"created_at": "Wed May 24 20:17:19 +0000 2017",
        ...             "lang": "en"}
        >>> get_lang(original)
        'en'

        >>> activity = {"postedTime": "2017-05-24T20:17:19.000Z",
        ...             "twitter_lang": "en"}
        >>> get_lang(activity)
        'en'
    """
    if is_original_format(tweet):
        lang_field = "lang"
    else:
        lang_field = "twitter_lang"
    # .get() so that a payload without the language field yields None
    # (as documented) instead of raising a KeyError.
    lang = tweet.get(lang_field)
    if lang is None or lang == "und":
        return None
    return lang
166 | 
167 | 
def get_poll_options(tweet):
    """
    Collect the text of every poll option in a Tweet into a list
    - A Tweet without a poll gives an empty list
    - A Tweet in activity-streams format raises 'NotAvailableError'

    Args:
        tweet (Tweet or dict): A Tweet object or dictionary

    Returns:
        list: list of strings, or, in the case where there is no poll,
        an empty list

    Raises:
        NotAvailableError for activity-streams format

    Example:
        >>> from tweet_parser.getter_methods.tweet_text import get_poll_options
        >>> original = {
        ...             "created_at": "Wed May 24 20:17:19 +0000 2017",
        ...             "entities": {"polls": [{"options": [{"text":"a"},
        ...                                                 {"text":"b"},
        ...                                                 {"text":"c"}]
        ...                             }]},
        ...            }
        >>> get_poll_options(original)
        ['a', 'b', 'c']

        >>> activity = {"postedTime": "2017-05-24T20:17:19.000Z",
        ...             "body": "some tweet text"}
        >>> get_poll_options(activity)
        Traceback (most recent call last):
        ...
        NotAvailableError: Gnip activity-streams format does not return poll options
    """
    if not is_original_format(tweet):
        raise NotAvailableError("Gnip activity-streams format does not"
                                " return poll options")
    try:
        return [option["text"]
                for poll in tweet["entities"]["polls"]
                for option in poll["options"]]
    except KeyError:
        # any missing key along the path means there is no usable poll data
        return []
216 | 
217 | 
def get_quote_or_rt_text(tweet):
    """
    Get the text of the Tweet that was quoted or retweeted
    (i.e. not the text typed by the posting user)
    - tweet: empty string (nothing was quoted or retweeted)
    - quote: the full text of the quoted Tweet only
    - retweet: the full text of the retweeted Tweet

    Args:
        tweet (Tweet or dict): A Tweet object or dictionary

    Returns:
        str: text of the retweeted-tweet or the quoted-tweet
        (empty string if this is an original Tweet)

    Example:
        >>> from tweet_parser.getter_methods.tweet_text import get_quote_or_rt_text
        >>> # a quote tweet
        >>> quote = {"created_at": "Wed May 24 20:17:19 +0000 2017",
        ...          "text": "adding my own commentary",
        ...          "truncated": False,
        ...          "quoted_status": {
        ...                 "created_at": "Mon May 01 05:00:05 +0000 2017",
        ...                 "truncated": False,
        ...                 "text": "an interesting Tweet"
        ...                }
        ...         }

        >>> get_quote_or_rt_text(quote)
        'an interesting Tweet'
    """
    kind = get_tweet_type(tweet)
    if kind == "tweet":
        return ""

    # pick the key that holds the embedded Tweet for this type and format
    if kind == "quote":
        if is_original_format(tweet):
            embedded_key = "quoted_status"
        else:
            embedded_key = "twitter_quoted_status"
    elif kind == "retweet":
        if is_original_format(tweet):
            embedded_key = "retweeted_status"
        else:
            embedded_key = "object"
    else:
        return None
    return get_full_text(tweet[embedded_key])
262 | 
263 | 
def get_all_text(tweet):
    """
    Assemble every piece of text attached to the tweet: the text entered
    by the user, the quoted or retweeted text and, for original format
    only, the poll options.

    Args:
        tweet (Tweet): A Tweet object (must be a Tweet object)

    Returns:
        str: text from tweet.user_entered_text, tweet.quote_or_rt_text and
        tweet.poll_options (if in original format), separated by newlines;
        empty pieces are left out
    """
    include_polls = is_original_format(tweet)
    pieces = [tweet.user_entered_text, tweet.quote_or_rt_text]
    if include_polls:
        # poll options are only read for original-format Tweets
        pieces.append("\n".join(tweet.poll_options))
    return "\n".join(piece for piece in pieces if piece)
283 | 
284 | 
def remove_links(text):
    """
    Helper function to remove the links from the input text

    Args:
        text (str): A string

    Returns:
        str: the same text, but with any substring that matches the regex
        for a link removed and replaced with a space. Text that follows a
        link is kept.

    Example:
        >>> from tweet_parser.getter_methods.tweet_text import remove_links
        >>> text = "lorem ipsum dolor https://twitter.com/RobotPrincessFi"
        >>> remove_links(text)
        'lorem ipsum dolor  '
        >>> remove_links("see https://t.co/AbC123 for more")
        'see   for more'
    """
    # Raw strings keep the regex escapes intact. The t.co pattern matches
    # only the link itself: the dot is escaped, the character class is
    # limited to letters and digits, and there is no trailing ".*" that
    # would swallow the rest of the line.
    tco_link_regex = re.compile(r"https?://t\.co/[A-Za-z0-9]+")
    generic_link_regex = re.compile(r"(https?://)?(\w*[.]\w+)+([/?=&]+\w+)*")
    without_tco = tco_link_regex.sub(" ", text)
    return generic_link_regex.sub(" ", without_tco)
307 | 


--------------------------------------------------------------------------------
/tweet_parser/getter_methods/tweet_user.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | # Copyright 2018 Twitter, Inc.
  3 | # Licensed under the MIT License
  4 | # https://opensource.org/licenses/MIT
  5 | 
  6 | from tweet_parser.tweet_checking import is_original_format
  7 | from tweet_parser.deprecator import deprecated
  8 | 
  9 | def get_user_id(tweet):
 10 |     """
 11 |     Get the Twitter ID of the user who posted the Tweet
 12 | 
 13 |     Args:
 14 |         tweet (Tweet): A Tweet object (or a dictionary)
 15 | 
 16 |     Returns:
 17 |         str: the Twitter ID of the user who posted the Tweet
 18 | 
 19 |     Example:
 20 |         >>> from tweet_parser.getter_methods.tweet_user import get_user_id
 21 |         >>> original_format_dict = {
 22 |         ...             "created_at": "Wed May 24 20:17:19 +0000 2017",
 23 |         ...             "user":
 24 |         ...              {"id_str": "815279070241955840"}
 25 |         ...            }
 26 |         >>> get_user_id(original_format_dict)
 27 |         '815279070241955840'
 28 | 
 29 |         >>> activity_streams_format_dict = {
 30 |         ...             "postedTime": "2017-05-24T20:17:19.000Z",
 31 |         ...             "actor":
 32 |         ...              {"id": "id:twitter.com:815279070241955840"}
 33 |         ...             }
 34 |         >>> get_user_id(activity_streams_format_dict)
 35 |         '815279070241955840'
 36 |     """
 37 | 
 38 |     if is_original_format(tweet):
 39 |         return tweet["user"]["id_str"]
 40 |     else:
 41 |         return tweet["actor"]["id"].split(":")[-1]
 42 | 
 43 | 
 44 | def get_screen_name(tweet):
 45 |     """
 46 |     Get the screen name (@ handle) of the user who posted the Tweet
 47 | 
 48 |     Args:
 49 |         tweet (Tweet): A Tweet object (or a dictionary)
 50 | 
 51 |     Returns:
 52 |         str: the @ handle of the user who posted the Tweet
 53 | 
 54 |     Example:
 55 |         >>> from tweet_parser.getter_methods.tweet_user import get_screen_name
 56 |         >>> original_format_dict = {
 57 |         ...             "created_at": "Wed May 24 20:17:19 +0000 2017",
 58 |         ...             "user":
 59 |         ...              {"screen_name": "RobotPrincessFi"}
 60 |         ...            }
 61 |         >>> get_screen_name(original_format_dict)
 62 |         'RobotPrincessFi'
 63 | 
 64 |         >>> activity_streams_format_dict = {
 65 |         ...             "postedTime": "2017-05-24T20:17:19.000Z",
 66 |         ...             "actor":
 67 |         ...              {"preferredUsername": "RobotPrincessFi"}
 68 |         ...             }
 69 |         >>> get_screen_name(activity_streams_format_dict)
 70 |         'RobotPrincessFi'
 71 |     """
 72 | 
 73 |     if is_original_format(tweet):
 74 |         return tweet["user"]["screen_name"]
 75 |     else:
 76 |         return tweet["actor"]["preferredUsername"]
 77 | 
 78 | 
 79 | def get_name(tweet):
 80 |     """
 81 |     Get the display name of the user who posted the Tweet
 82 | 
 83 |     Args:
 84 |         tweet (Tweet): A Tweet object (or a dictionary)
 85 | 
 86 |     Returns:
 87 |         str: the @ handle of the user who posted the Tweet
 88 | 
 89 |     Example:
 90 |         >>> from tweet_parser.getter_methods.tweet_user import get_name
 91 |         >>> original_format_dict = {
 92 |         ...             "created_at": "Wed May 24 20:17:19 +0000 2017",
 93 |         ...             "user":
 94 |         ...              {"name": "jk no"}
 95 |         ...            }
 96 |         >>> get_name(original_format_dict)
 97 |         'jk no'
 98 | 
 99 |         >>> activity_streams_format_dict = {
100 |         ...             "postedTime": "2017-05-24T20:17:19.000Z",
101 |         ...             "actor":
102 |         ...              {"displayName": "jk no"}
103 |         ...             }
104 |         >>> get_name(activity_streams_format_dict)
105 |         'jk no'
106 |     """
107 | 
108 |     if is_original_format(tweet):
109 |         return tweet["user"]["name"]
110 |     else:
111 |         return tweet["actor"]["displayName"]
112 | 
113 | 
def get_bio(tweet):
    """
    Return the bio text of the account that posted the Tweet

    Args:
        tweet (Tweet): A Tweet object (or a dictionary)

    Returns:
        str: the bio text of the user who posted the Tweet
        In a payload the absence of a bio seems to be represented by an
        empty string or a None, this getter always returns a string (so, empty
        string if no bio is available).

    Example:
        >>> from tweet_parser.getter_methods.tweet_user import get_bio
        >>> original_format_dict = {
        ...             "created_at": "Wed May 24 20:17:19 +0000 2017",
        ...             "user":
        ...              {"description": "Niche millenial content aggregator"}
        ...            }
        >>> get_bio(original_format_dict)
        'Niche millenial content aggregator'

        >>> activity_streams_format_dict = {
        ...             "postedTime": "2017-05-24T20:17:19.000Z",
        ...             "actor":
        ...              {"summary": "Niche millenial content aggregator"}
        ...             }
        >>> get_bio(activity_streams_format_dict)
        'Niche millenial content aggregator'
    """
    if is_original_format(tweet):
        profile, bio_key = tweet["user"], "description"
    else:
        profile, bio_key = tweet["actor"], "summary"
    bio = profile.get(bio_key, "")
    # an explicit None in the payload is normalised to an empty string
    return "" if bio is None else bio
154 | 
155 | 
def get_follower_count(tweet):
    """
    Return how many followers the posting user has

    Args:
        tweet (Tweet): A Tweet object (or a dictionary)

    Returns:
        int: the number of followers that the user has

    Example:
        >>> from tweet_parser.getter_methods.tweet_user import get_follower_count
        >>> original_format_dict = {
        ...             "created_at": "Wed May 24 20:17:19 +0000 2017",
        ...             "user":
        ...              {"followers_count": 2}
        ...            }
        >>> get_follower_count(original_format_dict)
        2

        >>> activity_streams_format_dict = {
        ...             "postedTime": "2017-05-24T20:17:19.000Z",
        ...             "actor":
        ...              {"followersCount": 2}
        ...             }
        >>> get_follower_count(activity_streams_format_dict)
        2
    """
    if not is_original_format(tweet):
        return tweet["actor"]["followersCount"]
    return tweet["user"]["followers_count"]
188 | 
189 | 
def get_following_count(tweet):
    """
    Return how many accounts the posting user is following

    Args:
        tweet (Tweet): A Tweet object (or a dictionary)

    Returns:
        int: the number of accounts that the user is following

    Example:
        >>> from tweet_parser.getter_methods.tweet_user import get_following_count
        >>> original_format_dict = {
        ...             "created_at": "Wed May 24 20:17:19 +0000 2017",
        ...             "user":
        ...              {"friends_count": 2}
        ...            }
        >>> get_following_count(original_format_dict)
        2

        >>> activity_streams_format_dict = {
        ...             "postedTime": "2017-05-24T20:17:19.000Z",
        ...             "actor":
        ...              {"friendsCount": 2}
        ...             }
        >>> get_following_count(activity_streams_format_dict)
        2
    """
    if not is_original_format(tweet):
        return tweet["actor"]["friendsCount"]
    return tweet["user"]["friends_count"]
222 | 
223 | 
@deprecated("See: https://developer.twitter.com/en/docs/tweets/enrichments/overview/klout")
def get_klout_score(tweet):
    """
    Warning: Klout is deprecated and is being removed from Tweet payloads May 2018. \n
    See https://developer.twitter.com/en/docs/tweets/enrichments/overview/klout \n

    Return the Klout score (int) of the user who posted the Tweet,
    if the payload carries one

    Args:
        tweet (Tweet): A Tweet object (or a dictionary)

    Returns:
        int: the Klout score (if it exists) of the user who posted the Tweet
            else return None

    Example:
        >>> from tweet_parser.getter_methods.tweet_user import get_klout_score
        >>> original_format_dict = {
        ...             "created_at": "Wed May 24 20:17:19 +0000 2017",
        ...             "user":
        ...              {"derived": {"klout": {"score": 12345}}}
        ...            }
        >>> get_klout_score(original_format_dict)
        12345

        >>> activity_streams_format_dict = {
        ...             "postedTime": "2017-05-24T20:17:19.000Z",
        ...             "gnip":{"klout_score": 12345}}
        >>> get_klout_score(activity_streams_format_dict)
        12345
    """
    try:
        if not is_original_format(tweet):
            return tweet['gnip']['klout_score']
        return tweet['user']['derived']['klout']['score']
    except KeyError:
        # any missing key on the path means no Klout score in this payload
        return None
263 | 
264 | 
@deprecated("See: https://developer.twitter.com/en/docs/tweets/enrichments/overview/klout")
def get_klout_profile(tweet):
    """
    Warning: Klout is deprecated and is being removed from Tweet payloads May 2018. \n
    See https://developer.twitter.com/en/docs/tweets/enrichments/overview/klout \n

    Return the Klout profile URL (str) of the user, if the payload
    carries one

    Args:
        tweet (Tweet): A Tweet object (or a dictionary)

    Returns:
        str: the user's Klout profile URL (if it exists), else return None

    Example:
        >>> from tweet_parser.getter_methods.tweet_user import get_klout_profile
        >>> original_format_dict = {
        ... "created_at": "Wed May 24 20:17:19 +0000 2017",
        ... "user":
        ...     {"derived": {"klout":
        ...         {"profile_url":
        ...             "http://klout.com/topic/id/10000000000000016635"}}}
        ... }
        >>> get_klout_profile(original_format_dict)
        'http://klout.com/topic/id/10000000000000016635'

        >>> activity_streams_format_dict = {
        ... "postedTime": "2017-05-24T20:17:19.000Z",
        ... "gnip":
        ...     {"klout_profile": {
        ...         "link": "http://klout.com/topic/id/10000000000000016635"}
        ...     }
        ... }
        >>> get_klout_profile(activity_streams_format_dict)
        'http://klout.com/topic/id/10000000000000016635'
    """
    try:
        if not is_original_format(tweet):
            return tweet['gnip']['klout_profile']['link']
        return tweet['user']['derived']['klout']['profile_url']
    except KeyError:
        # any missing key on the path means no Klout profile in this payload
        return None
309 | 
310 | 
@deprecated("See: https://developer.twitter.com/en/docs/tweets/enrichments/overview/klout")
def get_klout_id(tweet):
    """
    Warning: Klout is deprecated and is being removed from Tweet payloads May 2018. \n
    See https://developer.twitter.com/en/docs/tweets/enrichments/overview/klout \n

    Return the Klout ID (str) of the user, if the payload carries one

    Args:
        tweet (Tweet): A Tweet object (or a dictionary)

    Returns:
        str: the user's Klout ID (if it exists), else return None

    Example:
        >>> from tweet_parser.getter_methods.tweet_user import get_klout_id
        >>> original_format_dict = {
        ... "created_at": "Wed May 24 20:17:19 +0000 2017",
        ...     "user":
        ...         {"derived": {"klout":
        ...             {"user_id":"1234567890"}}}
        ...     }
        >>> get_klout_id(original_format_dict)
        '1234567890'

        >>> activity_streams_format_dict = {
        ... "postedTime": "2017-05-24T20:17:19.000Z",
        ... "gnip":
        ...     {"klout_profile": {
        ...         "klout_user_id": "1234567890"}
        ...     }}
        >>> get_klout_id(activity_streams_format_dict)
        '1234567890'
    """
    try:
        if not is_original_format(tweet):
            return tweet['gnip']['klout_profile']['klout_user_id']
        return tweet['user']['derived']['klout']['user_id']
    except KeyError:
        # any missing key on the path means no Klout ID in this payload
        return None
353 | 
354 | 
@deprecated("See: https://developer.twitter.com/en/docs/tweets/enrichments/overview/klout")
def get_klout_topics(tweet, topic_type='influence'):
    """
    Warning: Klout is deprecated and is being removed from Tweet payloads May 2018. \n
    See https://developer.twitter.com/en/docs/tweets/enrichments/overview/klout \n

    Return the user's Klout topics (a list of dicts), if the payload
    carries them.
    Regardless of format or topic type, topic dicts will have the same keys:
    "url", "id", "name", "score"

    Args:
        tweet (Tweet): A Tweet object
        topic_type (str): Which type of Klout topic to return.
                          Options are limited to 'influence' and 'interest'

    Returns:
        list: A list of dicts representing Klout topics, or if Klout topics \
        do not exist in the Tweet payload, return None. The list is sorted by
        the "score" value, in ascending order.

    Example:
        >>> result = [{
        ...     # the user's score for that topic
        ...     "score": 0.54,
        ...     # the Klout topic ID
        ...     "id": "10000000000000019376",
        ...     # the Klout topic URL
        ...     "url": "http://klout.com/topic/id/10000000000000019376",
        ...     # the Klout topic name
        ...     "name": "Emoji"
        ... },
        ... {
        ... "score": 0.43,
        ... "id": "9159",
        ... "url": "http://klout.com/topic/id/9159",
        ... "name": "Vegetables"
        ... }]
    """
    # locate the raw topic list; a missing key anywhere means "no topics"
    try:
        if is_original_format(tweet):
            topics_key = '{}_topics'.format(topic_type)
            raw_topics = tweet['user']['derived']['klout'][topics_key]
        else:
            raw_topics = tweet['gnip']['klout_profile']['topics']
    except KeyError:
        return None

    if is_original_format(tweet):
        # original format already uses the target key names; the dicts are
        # rebuilt so both formats yield exactly the same four keys
        selected = raw_topics
        url_key, name_key = 'url', 'name'
    else:
        # activity streams mixes topic types in one list, so filter first
        selected = [entry for entry in raw_topics
                    if entry['topic_type'] == topic_type]
        url_key, name_key = 'link', 'displayName'

    normalized = []
    for entry in selected:
        normalized.append(dict(url=entry[url_key],
                               id=entry['id'],
                               name=entry[name_key],
                               score=entry['score']))
    normalized.sort(key=lambda item: item['score'])
    return normalized
422 | 


--------------------------------------------------------------------------------
/tweet_parser/lazy_property.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # Copyright 2018 Twitter, Inc.
 3 | # Licensed under the MIT License
 4 | # https://opensource.org/licenses/MIT
 5 | """Module to define a lazy property decorator that allows
 6 | attributes to be generated dynamically and cached after creation.
 7 | Original idea found via
 8 | http://stevenloria.com/lazy-evaluated-properties-in-python/
 9 | and lightly modified to preserve underlying docstrings.
10 | """
11 | from functools import wraps
12 | 
def lazy_property(fn):
    """
    Decorator turning a method into a read-only property whose value is
    computed on first access and then cached on the instance. The wrapped
    function's docstring is preserved.

    Args:
        fn (function): the property in question

    Returns:
        property: a property that calls ``fn`` once per instance and
        stores the result under the attribute ``'_lazy_' + fn.__name__``.
    """
    cache_name = '_lazy_' + fn.__name__
    # unique marker to tell "nothing cached yet" apart from a cached None
    not_cached = object()

    def _lazy_property(self):
        if getattr(self, cache_name, not_cached) is not_cached:
            setattr(self, cache_name, fn(self))
        return getattr(self, cache_name)

    return property(wraps(fn)(_lazy_property))
33 | 


--------------------------------------------------------------------------------
/tweet_parser/tweet.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | # Copyright 2018 Twitter, Inc.
  3 | # Licensed under the MIT License
  4 | # https://opensource.org/licenses/MIT
  5 | import datetime
  6 | 
  7 | from tweet_parser.lazy_property import lazy_property
  8 | from tweet_parser.tweet_parser_errors import NotATweetError
  9 | from tweet_parser import tweet_checking
 10 | from tweet_parser.getter_methods import tweet_date, tweet_user, tweet_counts
 11 | from tweet_parser.getter_methods import tweet_text, tweet_geo, tweet_links
 12 | from tweet_parser.getter_methods import tweet_entities, tweet_embeds
 13 | from tweet_parser.getter_methods import gnip_fields, tweet_generator, tweet_reply
 14 | 
 15 | 
 16 | class Tweet(dict):
 17 |     """
 18 |     Tweet object created from a dictionary representing a Tweet paylaod
 19 | 
 20 |     Args:
 21 |         tweet_dict (dict): A dictionary representing a Tweet payload
 22 |         do_format_checking (bool): If "True", compare the keys in this \
 23 |         dict to a supeset of expected keys and to a minimum set of expected \
 24 |         keys (as defined in tweet_parser.tweet_keys). \
 25 |         Will cause the parser to fail if unexpected keys are present \
 26 |         or if expected keys are missing. \
 27 |         Intended to allow run-time format testing, allowing the user \
 28 |         to surface unexpected format changes.
 29 | 
 30 |     Returns:
 31 |         Tweet: Class "Tweet", inherits from dict, provides properties to
 32 |         get various data values from the Tweet.
 33 | 
 34 |     Raises:
 35 |         NotATweetError: the Tweet dict is malformed, \
 36 |         see `tweet_checking.check_tweet` for details
 37 | 
 38 |     Example:
 39 |         >>> from tweet_parser.tweet import Tweet
 40 |         >>> # python dict representing a Tweet
 41 |         >>> tweet_dict = {"id": 867474613139156993,
 42 |         ...               "id_str": "867474613139156993",
 43 |         ...               "created_at": "Wed May 24 20:17:19 +0000 2017",
 44 |         ...               "text": "Some Tweet text",
 45 |         ...               "user": {
 46 |         ...                   "screen_name": "RobotPrincessFi",
 47 |         ...                   "id_str": "815279070241955840"
 48 |         ...                   }
 49 |         ...              }
 50 |         >>> # create a Tweet object
 51 |         >>> tweet = Tweet(tweet_dict)
 52 |         >>> # use the Tweet obj to access data elements
 53 |         >>> tweet.id
 54 |         '867474613139156993'
 55 |         >>> tweet.created_at_seconds
 56 |         1495657039
 57 |     """
 58 |     def __init__(self, tweet_dict, do_format_validation=False):
 59 |         """
 60 |         Initialize a Tweet object from a dict representing a Tweet payload
 61 |         """
 62 | 
 63 |         # get the format of the Tweet data
 64 |         # also, this throws an error if it's not a tweet
 65 |         self.original_format = tweet_checking.check_tweet(tweet_dict,
 66 |                                                           do_format_validation)
 67 | 
 68 |         # make sure that this obj has all of the keys that our dict had
 69 |         self.update(tweet_dict)
 70 | 
 71 |     @lazy_property
 72 |     def id(self):
 73 |         """
 74 |         Tweet snowflake id as a string
 75 | 
 76 |         Returns:
 77 |             str: Twitter snowflake id, numeric only (no other text)
 78 | 
 79 |         Example:
 80 |             >>> from tweet_parser.tweet import Tweet
 81 |             >>> original_format_dict = {
 82 |             ...     "created_at": "Wed May 24 20:17:19 +0000 2017",
 83 |             ...     "id": 867474613139156993,
 84 |             ...     "id_str": "867474613139156993",
 85 |             ...     "user": {"user_keys":"user_data"},
 86 |             ...     "text": "some tweet text"
 87 |             ...     }
 88 |             >>> Tweet(original_format_dict).id
 89 |             '867474613139156993'
 90 | 
 91 |             >>> activity_streams_dict = {
 92 |             ...     "postedTime": "2017-05-24T20:17:19.000Z",
 93 |             ...     "id": "tag:search.twitter.com,2005:867474613139156993",
 94 |             ...     "actor": {"user_keys":"user_data"},
 95 |             ...     "body": "some tweet text"
 96 |             ...     }
 97 |             >>> Tweet(activity_streams_dict).id
 98 |             '867474613139156993'
 99 |         """
100 |         if self.original_format:
101 |             return self["id_str"]
102 |         else:
103 |             return self["id"].split(":")[-1]
104 | 
105 |     @lazy_property
106 |     def created_at_seconds(self):
107 |         """
108 |         Time that a Tweet was posted in seconds since the Unix epoch
109 | 
110 |         Returns:
111 |             int: seconds since the unix epoch
112 |             (determined by converting Tweet.id
113 |             into a timestamp using `tweet_date.snowflake2utc`)
114 |         """
115 |         return tweet_date.snowflake2utc(self.id)
116 | 
117 |     @lazy_property
118 |     def created_at_datetime(self):
119 |         """
120 |         Time that a Tweet was posted as a Python datetime object
121 | 
122 |         Returns:
123 |             datetime.datetime: the value of `tweet.created_at_seconds`
124 |             converted into a datetime object
125 |         """
126 |         return datetime.datetime.utcfromtimestamp(self.created_at_seconds)
127 | 
128 |     @lazy_property
129 |     def created_at_string(self):
130 |         """
131 |         Time that a Tweet was posted as a string with the format
132 |         YYYY-mm-ddTHH:MM:SS.000Z
133 | 
134 |         Returns:
135 |             str: the value of `tweet.created_at_seconds`
136 |             converted into a string (YYYY-mm-ddTHH:MM:SS.000Z)
137 |         """
138 |         return self.created_at_datetime.strftime("%Y-%m-%dT%H:%M:%S.000Z")
139 | 
140 |     @lazy_property
141 |     def user_id(self):
142 |         """
143 |         The Twitter ID of the user who posted the Tweet
144 | 
145 |         Returns:
146 |             str: value returned by calling `tweet_user.get_user_id` on `self`
147 |         """
148 |         return tweet_user.get_user_id(self)
149 | 
150 |     @lazy_property
151 |     def screen_name(self):
152 |         """
153 |         The screen name (@ handle) of the user who posted the Tweet
154 | 
155 |         Returns:
156 |             str: value returned by calling `tweet_user.get_screen_name` on `self`
157 |         """
158 |         return tweet_user.get_screen_name(self)
159 | 
160 |     @lazy_property
161 |     def name(self):
162 |         """
163 |         The display name of the user who posted the Tweet
164 | 
165 |         Returns:
166 |             str: value returned by calling `tweet_user.get_name` on `self`
167 |         """
168 |         return tweet_user.get_name(self)
169 | 
170 |     @lazy_property
171 |     def bio(self):
172 |         """
173 |         The bio text of the user who posted the Tweet
174 | 
175 |         Returns:
176 |             str: the user's bio text.
177 |             value returned by calling `tweet_user.get_bio` on `self`
178 |         """
179 |         return tweet_user.get_bio(self)
180 | 
181 |     @lazy_property
182 |     def follower_count(self):
183 |         """
184 |         The number of followers that the author of the Tweet has
185 | 
186 |         Returns:
187 |             int: the number of followers.
188 |             value returned by calling `get_follower_count` on `self`
189 |         """
190 |         return tweet_user.get_follower_count(self)
191 | 
192 |     @lazy_property
193 |     def following_count(self):
194 |         """
195 |         The number of accounts that the author of the Tweet is following
196 | 
197 |         Returns:
198 |             int: the number of accounts that the author of the Tweet is following,
199 |             value returned by calling `get_following_count` on `self`
200 |         """
201 |         return tweet_user.get_following_count(self)
202 | 
203 |     @lazy_property
204 |     def klout_score(self):
205 |         """
206 |         (DEPRECATED): 
207 |         The Klout score (int) (if it exists) of the user who posted the Tweet
208 | 
209 |         Returns:
210 |             int: value returned by calling `tweet_user.get_klout_score` on `self`
211 |             (if no Klout is present, this returns a None)
212 |         """
213 |         return tweet_user.get_klout_score(self)
214 | 
215 |     @lazy_property
216 |     def klout_profile(self):
217 |         """
218 |         (DEPRECATED): 
219 |         The Klout profile URL of the user (`str`) (if it exists)
220 | 
221 |         Returns:
222 |             str: value returned by calling `tweet_user.get_klout_profile` on `self`
223 |             (if no Klout is present, this returns a `None`)
224 |         """
225 |         return tweet_user.get_klout_profile(self)
226 | 
227 |     @lazy_property
228 |     def klout_id(self):
229 |         """
230 |         (DEPRECATED): 
231 |         The Klout ID of the user (`str`) (if it exists)
232 | 
233 |         Returns:
234 |             str: value returned by calling `tweet_user.get_klout_id` on `self`
235 |             (if no Klout is present, this returns a `None`)
236 |         """
237 |         return tweet_user.get_klout_id(self)
238 | 
239 |     @lazy_property
240 |     def klout_influence_topics(self):
241 |         """
242 |         (DEPRECATED): 
243 |         Get the user's Klout influence topics (a list of dicts), if it exists.
244 |         Topic dicts will have these keys: `url`, `id`, `name`, `score`
245 | 
246 |         Returns:
247 |             list: value returned by calling
248 |             `tweet_user.get_klout_topics(self, topic_type = 'influence')`
249 |             (if no Klout is present, this returns a `None`)
250 |         """
251 |         return tweet_user.get_klout_topics(self, topic_type='influence')
252 | 
253 |     @lazy_property
254 |     def klout_interest_topics(self):
255 |         """
256 |         (DEPRECATED): 
257 |         Get the user's Klout interest topics (a list of dicts), if it exists.
258 |         Topic dicts will have these keys: `url`, `id`, `name`, `score`
259 | 
260 |         Returns:
261 |             list: value returned by calling
262 |             `tweet_user.get_klout_topics(self, topic_type = 'interest')`
263 |             (if no Klout is present, this returns a `None`)
264 |         """
265 |         return tweet_user.get_klout_topics(self, topic_type='interest')
266 | 
267 |     @lazy_property
268 |     def text(self):
269 |         """
270 |         The contents of "text" (original format)
271 |         or "body" (activity streams format)
272 | 
273 |         Returns:
274 |             str: value returned by calling `tweet_text.get_text` on `self`
275 |         """
276 |         return tweet_text.get_text(self)
277 | 
278 |     @lazy_property
279 |     def tweet_type(self):
280 |         """
281 |         The type of Tweet this is (3 options: tweet, quote, and retweet)
282 | 
283 |         Returns:
284 |             str: ("tweet","quote" or "retweet" only)
285 |             value returned by calling `tweet_text.get_tweet_type` on `self`
286 |         """
287 |         return tweet_text.get_tweet_type(self)
288 | 
289 |     @lazy_property
290 |     def user_entered_text(self):
291 |         """
292 |         The text that the posting user entered \n
293 |         *tweet*: untruncated (includes @-mention replies and long links)
294 |         text of an original Tweet \n
295 |         *quote tweet*: untruncated poster-added content in a quote-tweet \n
296 |         *retweet*: empty string
297 | 
298 |         Returns:
299 |             str: if `tweet.tweet_type == "retweet"`, returns an empty string
300 |             else, returns the value of `tweet_text.get_full_text(self)`
301 |         """
302 |         if self.tweet_type == "retweet":
303 |             return ""
304 |         return tweet_text.get_full_text(self)
305 | 
306 |     @lazy_property
307 |     def lang(self):
308 |         """
309 |         The language that the Tweet is written in.
310 | 
311 |         Returns:
312 |             str: 2-letter BCP 47 language code (or None if undefined)
313 |             Value returned by calling `tweet_text.get_lang` on `self`
314 |         """
315 |         return tweet_text.get_lang(self)
316 | 
317 |     @lazy_property
318 |     def poll_options(self):
319 |         """
320 |         The text in the options of a poll as a list. \
321 |         If there is no poll in the Tweet, return an empty list. \
322 |         If activity-streams format, raise `NotAvailableError`
323 | 
324 |         Returns:
325 |             list (list of strings): value returned by calling
326 |             `tweet_text.get_poll_options` on `self`
327 |         """
328 |         return tweet_text.get_poll_options(self)
329 | 
330 |     @lazy_property
331 |     def quote_or_rt_text(self):
332 |         """
333 |         The quoted or retweeted text in a Tweet
334 |         (this is not the text entered by the posting user) \n
335 |         - tweet: empty string (there is no quoted or retweeted text) \n
336 |         - quote: only the text of the quoted Tweet \n
337 |         - retweet: the text of the retweet
338 | 
339 |         Returns:
340 |             str: value returned by calling
341 |             tweet_text.get_quote_or_rt_text on `self`
342 |         """
343 |         return tweet_text.get_quote_or_rt_text(self)
344 | 
345 |     @lazy_property
346 |     def all_text(self):
347 |         """
348 |         All of the text of the tweet. This includes @ mentions, long links,
349 |         quote-tweet contents (separated by a newline), RT contents
350 |         & poll options
351 | 
352 |         Returns:
353 |             str: value returned by calling `tweet_text.get_all_text` on `self`
354 |         """
355 |         return tweet_text.get_all_text(self)
356 | 
357 |     @lazy_property
358 |     def geo_coordinates(self):
359 |         """
360 |         The user's geo coordinates, if they are included in the payload
361 |         (otherwise return `None`).
362 |         Dictionary with the keys "latitude" and "longitude" or `None`
363 | 
364 |         Returns:
365 |             dict: value returned by calling `tweet_geo.get_geo_coordinates` on `self`
366 |         """
367 |         return tweet_geo.get_geo_coordinates(self)
368 | 
369 |     @lazy_property
370 |     def profile_location(self):
371 |         """
372 |         User's derived location data from the profile location enrichment
373 |         If unavailable, returns `None`.
374 | 
375 |         Returns:
376 |             dict: value returned by calling tweet_geo.get_profile_location on `self`
377 | 
378 |         Example:
379 |             >>> result = {"country": "US",         # Two letter ISO-3166 country code
380 |             ...           "locality": "Boulder",   # The locality location (~ city)
381 |             ...           "region": "Colorado",    # The region location (~ state/province)
382 |             ...           "sub_region": "Boulder", # The sub-region location (~ county)
383 |             ...           "full_name": "Boulder, Colorado, US", # The full name (excluding sub-region)
384 |             ...           "geo":  [40,-105]        # lat/long value that coordinate that corresponds to
385 |             ...                                     # the lowest granularity location for where the user
386 |             ...                                     # who created the Tweet is from
387 |             ... }
388 |         """
389 |         return tweet_geo.get_profile_location(self)
390 | 
391 |     @lazy_property
392 |     def tweet_links(self):
393 |         """
394 |         The links that are included in the Tweet as "urls"
395 |         (if there are no links, this is an empty list)
396 |         This includes links that are included in quoted or retweeted Tweets
397 |         Returns unrolled or expanded_url information if it is available
398 | 
399 |         Returns:
400 |             list (list of dicts): A list of dictionaries containing information
401 |             about urls. Each dictionary entity can have these keys; without
402 |             unwound url or expanded url Twitter data enrichments many of these
403 |             fields will be missing.
404 |             (value returned by calling tweet_links.get_tweet_links on `self`)
405 | 
406 |         Example:
407 |             >>> result = [
408 |             ...   {
409 |             ...   # url that shows up in the tweet text
410 |             ...   'display_url': "https://twitter.com/RobotPrinc...",
411 |             ...   # long (expanded) url
412 |             ...   'expanded_url': "https://twitter.com/RobotPrincessFi",
413 |             ...   # characters where the display link is
414 |             ...   'indices': [55, 88],
415 |             ...   'unwound': {
416 |             ...      # description from the linked webpage
417 |             ...      'description': "the Twitter profile of RobotPrincessFi",
418 |             ...      'status': 200,
419 |             ...      # title of the webpage
420 |             ...      'title': "the Twitter profile of RobotPrincessFi",
421 |             ...      # long (expanded) url}
422 |             ...      'url': "https://twitter.com/RobotPrincessFi"},
423 |             ...   # the url that tweet directs to, often t.co
424 |             ...   'url': "t.co/1234"}]
425 |         """
426 |         return tweet_links.get_tweet_links(self)
427 | 
428 |     @lazy_property
429 |     def most_unrolled_urls(self):
430 |         """
431 |         For each url included in the Tweet "urls", get the most unrolled
432 |         version available. Only return 1 url string per url in tweet.tweet_links
433 |         In order of preference for "most unrolled"
434 |         (keys from the dict at tweet.tweet_links): \n
435 |         1. `unwound`/`url` \n
436 |         2. `expanded_url` \n
437 |         3. `url`
438 | 
439 |         Returns:
440 |             list (a list of strings): list of urls
441 |             value returned by calling tweet_links.get_most_unrolled_urls on `self`
442 |         """
443 |         return tweet_links.get_most_unrolled_urls(self)
444 | 
445 |     @lazy_property
446 |     def user_mentions(self):
447 |         """
448 |         The @-mentions in the Tweet as dictionaries.
449 |         Note that in the case of a quote-tweet, this does not return the users
450 |         mentioned in the quoted status. The recommended way to get that list
451 |         would be to use 'tweet.quoted_tweet.user_mentions'.
452 |         Also note that in the caes of a quote-tweet, the list of @-mentioned
453 |         users does not include the user who authored the original (quoted) Tweet,
454 |         you can get the author of the quoted tweet using
455 |         `tweet.quoted_tweet.user_id`
456 | 
457 |         Returns:
458 |             list (list of dicts): 1 item per @ mention,
459 |             value returned by calling `tweet_entities.get_user_mentions` on `self`
460 | 
461 |         Example:
462 |             >>> result = {
463 |             ...   #characters where the @ mention appears
464 |             ...   "indices": [14,26],
465 |             ...   #id of @ mentioned user as a string
466 |             ...   "id_str": "2382763597",
467 |             ...   #screen_name of @ mentioned user
468 |             ...   "screen_name": "notFromShrek",
469 |             ...   #display name of @ mentioned user
470 |             ...   "name": "Fiona",
471 |             ...   #id of @ mentioned user as an int
472 |             ...   "id": 2382763597
473 |             ... }
474 | 
475 |         """
476 |         return tweet_entities.get_user_mentions(self)
477 | 
478 |     @lazy_property
479 |     def hashtags(self):
480 |         """
481 |         A list of hashtags in the Tweet.
482 |         Note that in the case of a quote-tweet, this does not return the
483 |         hashtags in the quoted status. The recommended way to get that list
484 |         would be to use `tweet.quoted_tweet.hashtags`
485 | 
486 |         Returns:
487 |             list (a list of strings): list of all of the hashtags in the Tweet
488 |             value returned by calling `tweet_entities.get_hashtags` on `self`
489 |         """
490 |         return tweet_entities.get_hashtags(self)
491 | 
492 |     @lazy_property
493 |     def media_urls(self):
494 |         """
495 |         A list of all media (https) urls in the tweet, useful for grabbing
496 |         photo/video urls for other purposes.
497 | 
498 |         Returns:
499 |             list (a list of strings): list of all of the media urls in the Tweet
500 |             value returned by calling `tweet_entities.get_media_urls` on `self`
501 |         """
502 |         return tweet_entities.get_media_urls(self)
503 | 
504 |     @lazy_property
505 |     def quoted_tweet(self):
506 |         """
507 |         The quoted Tweet as a Tweet object
508 |         If the Tweet is not a quote Tweet, return None
509 |         If the quoted Tweet payload cannot be loaded as a Tweet, this will
510 |         raise a "NotATweetError"
511 | 
512 |         Returns:
513 |             Tweet: A Tweet representing the quoted status (or None)
514 |             (see tweet_embeds.get_quote_tweet, this is that value as a Tweet)
515 | 
516 |         Raises:
517 |             NotATweetError: if quoted tweet is malformed
518 |         """
519 |         quote_tweet = tweet_embeds.get_quoted_tweet(self)
520 |         if quote_tweet is not None:
521 |             try:
522 |                 return Tweet(quote_tweet)
523 |             except NotATweetError as nate:
524 |                 raise(NotATweetError("The quote-tweet payload appears malformed." +
525 |                                      " Failed with '{}'".format(nate)))
526 |         else:
527 |             return None
528 | 
529 |     @lazy_property
530 |     def retweeted_tweet(self):
531 |         """
532 |         The retweeted Tweet as a Tweet object
533 |         If the Tweet is not a Retweet, return None
534 |         If the Retweet payload cannot be loaded as a Tweet, this will
535 |         raise a `NotATweetError`
536 | 
537 |         Returns:
538 |             Tweet: A Tweet representing the retweeted status (or None)
539 |             (see tweet_embeds.get_retweet, this is that value as a Tweet)
540 | 
541 |         Raises:
542 |             NotATweetError: if retweeted tweet is malformed
543 |         """
544 |         retweet = tweet_embeds.get_retweeted_tweet(self)
545 |         if retweet is not None:
546 |             try:
547 |                 return Tweet(retweet)
548 |             except NotATweetError as nate:
549 |                 raise(NotATweetError("The retweet payload appears malformed." +
550 |                                      " Failed with '{}'".format(nate)))
551 |         else:
552 |             return None
553 | 
554 |     @lazy_property
555 |     def embedded_tweet(self):
556 |         """
557 |         Get the retweeted Tweet OR the quoted Tweet and return it as a Tweet object
558 | 
559 |         Returns:
560 |             Tweet (or None, if the Tweet is neither a quote tweet or a Retweet):
561 |             a Tweet representing the quote Tweet or the Retweet
562 |             (see tweet_embeds.get_embedded_tweet, this is that value as a Tweet)
563 | 
564 |         Raises:
565 |             NotATweetError: if embedded tweet is malformed
566 |         """
567 |         embedded_tweet = tweet_embeds.get_embedded_tweet(self)
568 |         if embedded_tweet is not None:
569 |             try:
570 |                 return Tweet(embedded_tweet)
571 |             except NotATweetError as nate:
572 |                 raise(NotATweetError("The embedded tweet payload {} appears malformed." +
573 |                                      " Failed with '{}'".format(embedded_tweet, nate)))
574 |         else:
575 |             return None
576 | 
577 |     @lazy_property
578 |     def gnip_matching_rules(self):
579 |         """
580 |         Get the Gnip tagged rules that this tweet matched.
581 | 
582 |         Returns:
583 |             List of potential tags with the matching rule or None if no rules
584 |             are defined.
585 | 
586 |         """
587 |         return gnip_fields.get_matching_rules(self)
588 | 
589 |     @lazy_property
590 |     def generator(self):
591 |         """
592 |         Get information about the application that generated the Tweet
593 | 
594 |         Returns:
595 |             dict: keys are 'link' and 'name', the link to and name of the application
596 |             that generated the Tweet.
597 |             value returned by calling `tweet_generator.get_generator` on `self`
598 |         """
599 |         return tweet_generator.get_generator(self)
600 | 
601 |     @lazy_property
602 |     def in_reply_to_screen_name(self):
603 |         """
604 |         The screen name of the user being replied to (None if the Tweet isn't a reply)
605 | 
606 |         Returns:
607 |             str: value returned by calling `tweet_reply.get_in_reply_to_screen_name` on `self`
608 |         """
609 |         return tweet_reply.get_in_reply_to_screen_name(self)
610 | 
611 |     @lazy_property
612 |     def in_reply_to_user_id(self):
613 |         """
614 |         The user id of the user being replied to (None if the Tweet isn't a reply).
615 |         This raises a NotAvailableError for activity-streams format
616 | 
617 |         Returns:
618 |             str: value returned by calling `tweet_reply.get_in_reply_to_user_id` on `self`
619 |         """
620 |         return tweet_reply.get_in_reply_to_user_id(self)
621 | 
622 |     @lazy_property
623 |     def in_reply_to_status_id(self):
624 |         """
625 |         The status id of the Tweet being replied to (None if the Tweet isn't a reply)
626 | 
627 |         Returns:
628 |             str: value returned by calling `tweet_reply.get_in_reply_to_status_id` on `self`
629 |         """
630 |         return tweet_reply.get_in_reply_to_status_id(self)
631 | 
632 |     @lazy_property
633 |     def favorite_count(self):
634 |         """
635 |         The number of favorites that this tweet has received *at the time of
636 |         retrieval*. If a tweet is obtained from a live stream, this will likely
637 |         be 0.
638 | 
639 |         Returns:
640 |             int: value returned by calling `tweet_counts.get_favorite_count` on `self`
641 |         """
642 |         return tweet_counts.get_favorite_count(self)
643 | 
644 |     @lazy_property
645 |     def quote_count(self):
646 |         """
647 |         The number of tweets that this tweet has been quoted in *at the time of
648 |         retrieval*. If a tweet is obtained from a live stream, this will likely
649 |         be 0.
650 |         This raises a NotAvailableError for activity-streams format
651 |         
652 |         Returns:
653 |             int: value returned by calling `tweet_counts.get_quote_count` on `self` 
654 |             or raises NotAvailableError
655 |         """
656 |         return tweet_counts.get_quote_count(self)
657 | 
658 |     @lazy_property
659 |     def retweet_count(self):
660 |         """
661 |         The number of times this tweet has been retweeted *at the time of
662 |         retrieval*. If a tweet is obtained from a live stream, this will likely
663 |         be 0.
664 |         
665 |         Returns:
666 |             int: value returned by calling `tweet_counts.get_retweet_count` on `self` 
667 |         """
668 |         return tweet_counts.get_retweet_count(self)
669 | 


--------------------------------------------------------------------------------
/tweet_parser/tweet_checking.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | # Copyright 2018 Twitter, Inc.
  3 | # Licensed under the MIT License
  4 | # https://opensource.org/licenses/MIT
  5 | """Validation and checking methods for Tweets.
  6 | 
  7 | Methods here are primarily used by other methods within this module but can be
  8 | used for other validation code as well.
  9 | """
 10 | 
 11 | from tweet_parser.tweet_parser_errors import NotATweetError, UnexpectedFormatError
 12 | from tweet_parser.tweet_keys import original_format_minimum_set_keys
 13 | from tweet_parser.tweet_keys import activity_streams_minimum_set_keys
 14 | from tweet_parser.tweet_keys import original_format_superset_keys, activity_streams_superset_keys
 15 | 
 16 | 
 17 | def is_original_format(tweet):
 18 |     """
 19 |     Simple checker to flag the format of a tweet.
 20 | 
 21 |     Args:
 22 |         tweet (Tweet): tweet in qustion
 23 | 
 24 |     Returns:
 25 |         Bool
 26 | 
 27 |     Example:
 28 |         >>> import tweet_parser.tweet_checking as tc
 29 |         >>> tweet = {"created_at": 124125125125,
 30 |         ...          "text": "just setting up my twttr",
 31 |         ...          "nested_field": {"nested_1": "field", "nested_2": "field2"}}
 32 |         >>> tc.is_original_format(tweet)
 33 |         True
 34 |     """
 35 |     # deleted due to excess checking; it's a key lookup and does not need any
 36 |     # operational optimization
 37 |     if "created_at" in tweet:
 38 |         original_format = True
 39 |     elif "postedTime" in tweet:
 40 |         original_format = False
 41 |     else:
 42 |         raise NotATweetError("This dict has neither 'created_at' or 'postedTime' as keys")
 43 |     return original_format
 44 | 
 45 | 
 46 | def get_all_keys(tweet, parent_key=''):
 47 |     """
 48 |     Takes a tweet object and recursively returns a list of all keys contained
 49 |     in this level and all nexstted levels of the tweet.
 50 | 
 51 |     Args:
 52 |         tweet (Tweet): the tweet dict
 53 |         parent_key (str): key from which this process will start, e.g., you can
 54 |                           get keys only under some key that is not the top-level key.
 55 | 
 56 |     Returns:
 57 |         list of all keys in nested dicts.
 58 | 
 59 |     Example:
 60 |         >>> import tweet_parser.tweet_checking as tc
 61 |         >>> tweet = {"created_at": 124125125125, "text": "just setting up my twttr",
 62 |         ...          "nested_field": {"nested_1": "field", "nested_2": "field2"}}
 63 |         >>> tc.get_all_keys(tweet)
 64 |         ['created_at', 'text', 'nested_field nested_1', 'nested_field nested_2']
 65 |     """
 66 |     items = []
 67 |     for k, v in tweet.items():
 68 |         new_key = parent_key + " " + k
 69 |         if isinstance(v, dict):
 70 |             items.extend(get_all_keys(v, parent_key=new_key))
 71 |         else:
 72 |             items.append(new_key.strip(" "))
 73 |     return items
 74 | 
 75 | 
 76 | def key_validation_check(tweet_keys_list, superset_keys, minset_keys):
 77 |     """
 78 |     Validates the keys present in a Tweet.
 79 | 
 80 |     Args:
 81 |         tweet_keys_list (list): the keys present in a tweet
 82 |         superset_keys (set): the set of all possible keys for a tweet
 83 |         minset_keys (set): the set of minimal keys expected in a tweet.
 84 | 
 85 |     Returns:
 86 |         0 if no errors
 87 | 
 88 |     Raises:
 89 |         UnexpectedFormatError on any mismatch of keys.
 90 |     """
 91 |     # check for keys that must be present
 92 |     tweet_keys = set(tweet_keys_list)
 93 |     minset_overlap = tweet_keys & minset_keys
 94 |     if minset_overlap != minset_keys:
 95 |         raise UnexpectedFormatError("keys ({}) missing from Tweet (Public API data is not supported)"
 96 |                                     .format(minset_keys - tweet_keys))
 97 |     # check for keys that could be present
 98 |     unexpected_keys = tweet_keys - superset_keys
 99 |     if len(unexpected_keys) > 0:
100 |         raise UnexpectedFormatError("Unexpected keys ({}) are in this Tweet"
101 |                                     .format(unexpected_keys))
102 |     return 0
103 | 
104 | 
105 | 
def _check_original_format_tweet(tweet, validation_checking=False):
    """
    Verify that an original-format payload has the keys a Tweet must have.

    Args:
        tweet (dict/Tweet): the tweet payload
        validation_checking (bool): when true, additionally run
        `key_validation_check` on all (nested) keys of the payload against
        the original-format superset and minimum key sets.

    Raises:
        NotATweetError: if "user" or "text" is missing (the first missing
        one, in that order, is named in the message)
        UnexpectedFormatError: raised by `key_validation_check` when
        `validation_checking` is true and the keys do not match
    """
    absent = next((name for name in ("user", "text") if name not in tweet), None)
    if absent is not None:
        raise NotATweetError("This dict has no '{}' key".format(absent))
    if not validation_checking:
        return
    # check for changing keys
    key_validation_check(get_all_keys(tweet),
                         original_format_superset_keys,
                         original_format_minimum_set_keys)
115 | 
116 | 
def _check_activity_streams_tweet(tweet, validation_checking=False):
    """
    Verify that an activity-streams payload has the keys a Tweet must have.

    Args:
        tweet (dict/Tweet): the tweet payload
        validation_checking (bool): when true, additionally run
        `key_validation_check` on all (nested) keys of the payload against
        the activity-streams superset and minimum key sets.

    Raises:
        NotATweetError: if "actor" or "body" is missing (the first missing
        one, in that order, is named in the message)
        UnexpectedFormatError: raised by `key_validation_check` when
        `validation_checking` is true and the keys do not match
    """
    absent = next((name for name in ("actor", "body") if name not in tweet), None)
    if absent is not None:
        raise NotATweetError("This dict has no '{}' key".format(absent))
    if not validation_checking:
        return
    # check for changing keys
    key_validation_check(get_all_keys(tweet),
                         activity_streams_superset_keys,
                         activity_streams_minimum_set_keys)
126 | 
127 | 
128 | 
def check_tweet(tweet, validation_checking=False):
    """
    Ensures a tweet is valid and determines the type of format for the tweet.

    Args:
        tweet (dict/Tweet): the tweet payload
        validation_checking (bool): check for valid key structure in a tweet.

    Returns:
        bool: the result of `is_original_format(tweet)`, i.e. True for an
        original-format payload and False for an activity-streams payload

    Raises:
        NotATweetError: if the payload has no "id" key, or if the format
        detection or the format-specific check rejects it
    """
    if "id" not in tweet:
        raise NotATweetError("This text has no 'id' key")

    original_format = is_original_format(tweet)

    # pick the checker that matches the detected format, then run it
    checker = (_check_original_format_tweet if original_format
               else _check_activity_streams_tweet)
    checker(tweet, validation_checking=validation_checking)

    return original_format
149 | 


--------------------------------------------------------------------------------
/tweet_parser/tweet_keys.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | # Copyright 2018 Twitter, Inc.
  3 | # Licensed under the MIT License
  4 | # https://opensource.org/licenses/MIT
  5 | original_format_superset_keys = {
  6 |     'contributors',
  7 |     'coordinates',
  8 |     'created_at',
  9 |     'display_text_range',
 10 |     'entities hashtags',
 11 |     'entities media',
 12 |     'entities polls',
 13 |     'entities symbols',
 14 |     'entities urls',
 15 |     'entities user_mentions',
 16 |     'extended_entities media',
 17 |     'extended_tweet display_text_range',
 18 |     'extended_tweet entities hashtags',
 19 |     'extended_tweet entities media',
 20 |     'extended_tweet entities symbols',
 21 |     'extended_tweet entities urls',
 22 |     'extended_tweet entities user_mentions',
 23 |     'extended_tweet extended_entities media',
 24 |     'extended_tweet full_text',
 25 |     'favorite_count',
 26 |     'favorited',
 27 |     'filter_level',
 28 |     'geo',
 29 |     'id',
 30 |     'id_str',
 31 |     'in_reply_to_screen_name',
 32 |     'in_reply_to_status_id',
 33 |     'in_reply_to_status_id_str',
 34 |     'in_reply_to_user_id',
 35 |     'in_reply_to_user_id_str',
 36 |     'is_quote_status',
 37 |     'lang',
 38 |     'matching_rules',
 39 |     'place',
 40 |     'place bounding_box coordinates',
 41 |     'place bounding_box type',
 42 |     'place country',
 43 |     'place country_code',
 44 |     'place full_name',
 45 |     'place id',
 46 |     'place name',
 47 |     'place place_type',
 48 |     'place url',
 49 |     'possibly_sensitive',
 50 |     'quote_count',
 51 |     'quoted_status contributors',
 52 |     'quoted_status coordinates',
 53 |     'quoted_status created_at',
 54 |     'quoted_status display_text_range',
 55 |     'quoted_status entities hashtags',
 56 |     'quoted_status entities symbols',
 57 |     'quoted_status entities urls',
 58 |     'quoted_status entities user_mentions',
 59 |     'quoted_status extended_tweet display_text_range',
 60 |     'quoted_status extended_tweet entities hashtags',
 61 |     'quoted_status extended_tweet entities media',
 62 |     'quoted_status extended_tweet entities symbols',
 63 |     'quoted_status extended_tweet entities urls',
 64 |     'quoted_status extended_tweet entities user_mentions',
 65 |     'quoted_status extended_tweet extended_entities media',
 66 |     'quoted_status extended_tweet full_text',
 67 |     'quoted_status favorite_count',
 68 |     'quoted_status favorited',
 69 |     'quoted_status filter_level',
 70 |     'quoted_status geo',
 71 |     'quoted_status id',
 72 |     'quoted_status id_str',
 73 |     'quoted_status in_reply_to_screen_name',
 74 |     'quoted_status in_reply_to_status_id',
 75 |     'quoted_status in_reply_to_status_id_str',
 76 |     'quoted_status in_reply_to_user_id',
 77 |     'quoted_status in_reply_to_user_id_str',
 78 |     'quoted_status is_quote_status',
 79 |     'quoted_status lang',
 80 |     'quoted_status place',
 81 |     'quoted_status place bounding_box coordinates',
 82 |     'quoted_status place bounding_box type',
 83 |     'quoted_status place country',
 84 |     'quoted_status place country_code',
 85 |     'quoted_status place full_name',
 86 |     'quoted_status place id',
 87 |     'quoted_status place name',
 88 |     'quoted_status place place_type',
 89 |     'quoted_status place url',
 90 |     'quoted_status possibly_sensitive',
 91 |     'quoted_status quote_count',
 92 |     'quoted_status quoted_status_id',
 93 |     'quoted_status quoted_status_id_str',
 94 |     'quoted_status reply_count',
 95 |     'quoted_status retweet_count',
 96 |     'quoted_status retweeted',
 97 |     'quoted_status source',
 98 |     'quoted_status text',
 99 |     'quoted_status truncated',
100 |     'quoted_status user contributors_enabled',
101 |     'quoted_status user created_at',
102 |     'quoted_status user default_profile',
103 |     'quoted_status user default_profile_image',
104 |     'quoted_status user description',
105 |     'quoted_status user favourites_count',
106 |     'quoted_status user follow_request_sent',
107 |     'quoted_status user followers_count',
108 |     'quoted_status user following',
109 |     'quoted_status user friends_count',
110 |     'quoted_status user geo_enabled',
111 |     'quoted_status user id',
112 |     'quoted_status user id_str',
113 |     'quoted_status user is_translator',
114 |     'quoted_status user lang',
115 |     'quoted_status user listed_count',
116 |     'quoted_status user location',
117 |     'quoted_status user name',
118 |     'quoted_status user notifications',
119 |     'quoted_status user profile_background_color',
120 |     'quoted_status user profile_background_image_url',
121 |     'quoted_status user profile_background_image_url_https',
122 |     'quoted_status user profile_background_tile',
123 |     'quoted_status user profile_banner_url',
124 |     'quoted_status user profile_image_url',
125 |     'quoted_status user profile_image_url_https',
126 |     'quoted_status user profile_link_color',
127 |     'quoted_status user profile_sidebar_border_color',
128 |     'quoted_status user profile_sidebar_fill_color',
129 |     'quoted_status user profile_text_color',
130 |     'quoted_status user profile_use_background_image',
131 |     'quoted_status user protected',
132 |     'quoted_status user screen_name',
133 |     'quoted_status user statuses_count',
134 |     'quoted_status user time_zone',
135 |     'quoted_status user translator_type',
136 |     'quoted_status user url',
137 |     'quoted_status user verified',
138 |     'quoted_status_id',
139 |     'quoted_status_id_str',
140 |     'reply_count',
141 |     'retweet_count',
142 |     'retweeted',
143 |     'retweeted_status contributors',
144 |     'retweeted_status coordinates',
145 |     'retweeted_status created_at',
146 |     'retweeted_status display_text_range',
147 |     'retweeted_status entities hashtags',
148 |     'retweeted_status entities symbols',
149 |     'retweeted_status entities urls',
150 |     'retweeted_status entities user_mentions',
151 |     'retweeted_status extended_tweet display_text_range',
152 |     'retweeted_status extended_tweet entities hashtags',
153 |     'retweeted_status extended_tweet entities media',
154 |     'retweeted_status extended_tweet entities symbols',
155 |     'retweeted_status extended_tweet entities urls',
156 |     'retweeted_status extended_tweet entities user_mentions',
157 |     'retweeted_status extended_tweet extended_entities media',
158 |     'retweeted_status extended_tweet full_text',
159 |     'retweeted_status favorite_count',
160 |     'retweeted_status favorited',
161 |     'retweeted_status filter_level',
162 |     'retweeted_status geo',
163 |     'retweeted_status id',
164 |     'retweeted_status id_str',
165 |     'retweeted_status in_reply_to_screen_name',
166 |     'retweeted_status in_reply_to_status_id',
167 |     'retweeted_status in_reply_to_status_id_str',
168 |     'retweeted_status in_reply_to_user_id',
169 |     'retweeted_status in_reply_to_user_id_str',
170 |     'retweeted_status is_quote_status',
171 |     'retweeted_status lang',
172 |     'retweeted_status place bounding_box coordinates',
173 |     'retweeted_status place bounding_box type',
174 |     'retweeted_status place country',
175 |     'retweeted_status place country_code',
176 |     'retweeted_status place full_name',
177 |     'retweeted_status place id',
178 |     'retweeted_status place name',
179 |     'retweeted_status place place_type',
180 |     'retweeted_status place url',
181 |     'retweeted_status possibly_sensitive',
182 |     'retweeted_status quote_count',
183 |     'retweeted_status reply_count',
184 |     'retweeted_status retweet_count',
185 |     'retweeted_status retweeted',
186 |     'retweeted_status source',
187 |     'retweeted_status text',
188 |     'retweeted_status truncated',
189 |     'retweeted_status user contributors_enabled',
190 |     'retweeted_status user created_at',
191 |     'retweeted_status user default_profile',
192 |     'retweeted_status user default_profile_image',
193 |     'retweeted_status user description',
194 |     'retweeted_status user favourites_count',
195 |     'retweeted_status user follow_request_sent',
196 |     'retweeted_status user followers_count',
197 |     'retweeted_status user following',
198 |     'retweeted_status user friends_count',
199 |     'retweeted_status user geo_enabled',
200 |     'retweeted_status user id',
201 |     'retweeted_status user id_str',
202 |     'retweeted_status user is_translator',
203 |     'retweeted_status user lang',
204 |     'retweeted_status user listed_count',
205 |     'retweeted_status user location',
206 |     'retweeted_status user name',
207 |     'retweeted_status user notifications',
208 |     'retweeted_status user profile_background_color',
209 |     'retweeted_status user profile_background_image_url',
210 |     'retweeted_status user profile_background_image_url_https',
211 |     'retweeted_status user profile_background_tile',
212 |     'retweeted_status user profile_banner_url',
213 |     'retweeted_status user profile_image_url',
214 |     'retweeted_status user profile_image_url_https',
215 |     'retweeted_status user profile_link_color',
216 |     'retweeted_status user profile_sidebar_border_color',
217 |     'retweeted_status user profile_sidebar_fill_color',
218 |     'retweeted_status user profile_text_color',
219 |     'retweeted_status user profile_use_background_image',
220 |     'retweeted_status user protected',
221 |     'retweeted_status user screen_name',
222 |     'retweeted_status user statuses_count',
223 |     'retweeted_status user time_zone',
224 |     'retweeted_status user translator_type',
225 |     'retweeted_status user url',
226 |     'retweeted_status user verified',
227 |     'source',
228 |     'text',
229 |     'truncated',
230 |     'user contributors_enabled',
231 |     'user created_at',
232 |     'user default_profile',
233 |     'user default_profile_image',
234 |     'user derived klout influence_topics',
235 |     'user derived klout interest_topics',
236 |     'user derived klout profile_url',
237 |     'user derived klout score',
238 |     'user derived klout user_id',
239 |     'user description',
240 |     'user favourites_count',
241 |     'user follow_request_sent',
242 |     'user followers_count',
243 |     'user following',
244 |     'user friends_count',
245 |     'user geo_enabled',
246 |     'user id',
247 |     'user id_str',
248 |     'user is_translator',
249 |     'user lang',
250 |     'user listed_count',
251 |     'user location',
252 |     'user name',
253 |     'user notifications',
254 |     'user profile_background_color',
255 |     'user profile_background_image_url',
256 |     'user profile_background_image_url_https',
257 |     'user profile_background_tile',
258 |     'user profile_image_url',
259 |     'user profile_image_url_https',
260 |     'user profile_link_color',
261 |     'user profile_sidebar_border_color',
262 |     'user profile_sidebar_fill_color',
263 |     'user profile_text_color',
264 |     'user profile_use_background_image',
265 |     'user protected',
266 |     'user screen_name',
267 |     'user statuses_count',
268 |     'user time_zone',
269 |     'user translator_type',
270 |     'user url',
271 |     'user utc_offset',
272 |     'user verified',
273 |     'scopes followers', #no idea what the hell this key is
274 |     'geo', 
275 |     'coordinates',
276 |     'retweeted_status place', 
277 |     'user profile_banner_url', 
278 |     'retweeted_status extended_entities media', 
279 |     'retweeted_status entities media', 
280 |     'user derived locations',
281 |     'retweeted_status quoted_status in_reply_to_status_id_str',
282 |     'retweeted_status quoted_status user default_profile',
283 |     'retweeted_status quoted_status user profile_background_color',
284 |     'retweeted_status quoted_status truncated',
285 |     'retweeted_status quoted_status user profile_background_image_url',
286 |     'retweeted_status quoted_status user followers_count',
287 |     'retweeted_status quoted_status user id',
288 |     'retweeted_status quoted_status_id',
289 |     'retweeted_status quoted_status lang',
290 |     'retweeted_status quoted_status in_reply_to_user_id',
291 |     'retweeted_status quoted_status user protected',
292 |     'retweeted_status quoted_status user profile_use_background_image',
293 |     'retweeted_status quoted_status user profile_image_url',
294 |     'retweeted_status quoted_status user is_translator',
295 |     'retweeted_status quoted_status text',
296 |     'retweeted_status quoted_status in_reply_to_status_id',
297 |     'retweeted_status quoted_status id',
298 |     'retweeted_status quoted_status user favourites_count',
299 |     'retweeted_status quoted_status coordinates',
300 |     'retweeted_status quoted_status user description',
301 |     'retweeted_status quoted_status source',
302 |     'retweeted_status quoted_status user profile_image_url_https',
303 |     'retweeted_status quoted_status contributors',
304 |     'retweeted_status quoted_status user follow_request_sent',
305 |     'retweeted_status quoted_status created_at',
306 |     'retweeted_status quoted_status entities user_mentions',
307 |     'retweeted_status quoted_status user profile_sidebar_fill_color',
308 |     'retweeted_status quoted_status quote_count',
309 |     'retweeted_status quoted_status user following',
310 |     'retweeted_status quoted_status user profile_banner_url',
311 |     'retweeted_status quoted_status reply_count',
312 |     'retweeted_status quoted_status user name',
313 |     'retweeted_status quoted_status user profile_background_image_url_https',
314 |     'retweeted_status quoted_status entities symbols',
315 |     'retweeted_status quoted_status retweet_count',
316 |     'retweeted_status quoted_status user id_str',
317 |     'retweeted_status quoted_status retweeted',
318 |     'retweeted_status quoted_status user created_at',
319 |     'retweeted_status quoted_status place',
320 |     'retweeted_status quoted_status user friends_count',
321 |     'retweeted_status quoted_status user location',
322 |     'retweeted_status quoted_status user listed_count',
323 |     'retweeted_status quoted_status is_quote_status',
324 |     'retweeted_status quoted_status in_reply_to_user_id_str',
325 |     'retweeted_status quoted_status_id_str',
326 |     'retweeted_status quoted_status user screen_name',
327 |     'retweeted_status quoted_status user profile_sidebar_border_color',
328 |     'retweeted_status quoted_status user default_profile_image',
329 |     'retweeted_status quoted_status user utc_offset',
330 |     'retweeted_status quoted_status favorited',
331 |     'retweeted_status quoted_status user verified',
332 |     'retweeted_status quoted_status user profile_background_tile',
333 |     'retweeted_status quoted_status user translator_type',
334 |     'retweeted_status quoted_status user profile_text_color',
335 |     'retweeted_status quoted_status in_reply_to_screen_name',
336 |     'retweeted_status quoted_status user notifications',
337 |     'retweeted_status quoted_status user url',
338 |     'retweeted_status quoted_status id_str',
339 |     'retweeted_status quoted_status entities hashtags',
340 |     'retweeted_status quoted_status favorite_count',
341 |     'retweeted_status quoted_status geo',
342 |     'retweeted_status quoted_status user lang',
343 |     'retweeted_status quoted_status user geo_enabled',
344 |     'retweeted_status quoted_status user profile_link_color',
345 |     'retweeted_status quoted_status filter_level',
346 |     'retweeted_status quoted_status user contributors_enabled',
347 |     'retweeted_status quoted_status entities urls',
348 |     'retweeted_status quoted_status user statuses_count',
349 |     'retweeted_status quoted_status user time_zone',
350 |     'quoted_status entities media',
351 |     'retweeted_status quoted_status entities media',
352 |     'retweeted_status quoted_status display_text_range',
353 |     'retweeted_status quoted_status extended_entities media',
354 |     'quoted_status extended_entities media',
355 |     'retweeted_status quoted_status possibly_sensitive',
356 |     'user derived klout influence_topics',
357 |     'user derived klout user_id',
358 |     'user derived klout profile_url',
359 |     'user derived klout interest_topics',
360 |     'user derived klout score',
361 |     'retweeted_status quoted_status place country',
362 |     'retweeted_status quoted_status place url',
363 |     'retweeted_status quoted_status place full_name',
364 |     'retweeted_status quoted_status extended_tweet entities urls',
365 |     'retweeted_status quoted_status extended_tweet entities symbols',
366 |     'retweeted_status quoted_status extended_tweet entities hashtags',
367 |     'retweeted_status quoted_status extended_tweet full_text',
368 |     'retweeted_status quoted_status place id',
369 |     'retweeted_status quoted_status place name',
370 |     'retweeted_status quoted_status extended_tweet display_text_range',
371 |     'retweeted_status quoted_status extended_tweet extended_entities media',
372 |     'retweeted_status quoted_status extended_tweet entities media',
373 |     'retweeted_status quoted_status extended_tweet entities user_mentions',
374 |     'retweeted_status quoted_status place bounding_box coordinates',
375 |     'retweeted_status quoted_status place bounding_box type',
376 |     'retweeted_status quoted_status place place_type',
377 |     'retweeted_status quoted_status place country_code',
378 |     'coordinates type',
379 |     'geo type',
380 |     'coordinates coordinates',
381 |     'geo coordinates',
382 |     'retweeted_status scopes followers',
383 |     'retweeted_status coordinates type', 
384 |     'retweeted_status geo type', 
385 |     'retweeted_status coordinates coordinates',
386 |     'retweeted_status geo coordinates',
387 |     'retweeted_status entities polls',
388 |     'retweeted_status quoted_status quoted_status_id',
389 |     'retweeted_status quoted_status quoted_status_id_str',
390 |     'quoted_status entities polls',
391 |     'retweeted_status quoted_status entities polls',
392 |     'quoted_status coordinates coordinates', 
393 |     'quoted_status geo coordinates', 
394 |     'quoted_status geo type', 
395 |     'quoted_status coordinates type',
396 |     'quoted_status user withheld_in_countries', 
397 |     'retweeted_status quoted_status user withheld_in_countries', 
398 |     'retweeted_status quoted_status withheld_in_countries', 
399 |     'quoted_status withheld_in_countries',
400 |     'quoted_status scopes followers'
401 |     }
402 | 
# Keys an original-format Tweet must contain: key_validation_check raises
# UnexpectedFormatError when any member of this set is absent from the payload.
original_format_minimum_set_keys = {
    # tweet-level fields
    "contributors",
    "created_at",
    "entities hashtags",
    "entities symbols",
    "entities urls",
    "entities user_mentions",
    "favorite_count",
    "favorited",
    "filter_level",
    "id",
    "id_str",
    "in_reply_to_screen_name",
    "in_reply_to_status_id",
    "in_reply_to_status_id_str",
    "in_reply_to_user_id",
    "in_reply_to_user_id_str",
    "is_quote_status",
    "lang",
    "matching_rules",
    "quote_count",
    "reply_count",
    "retweet_count",
    "retweeted",
    "source",
    "text",
    "truncated",
    # fields nested under "user"
    "user contributors_enabled",
    "user created_at",
    "user default_profile",
    "user default_profile_image",
    "user description",
    "user favourites_count",
    "user follow_request_sent",
    "user followers_count",
    "user following",
    "user friends_count",
    "user geo_enabled",
    "user id",
    "user id_str",
    "user is_translator",
    "user lang",
    "user listed_count",
    "user location",
    "user name",
    "user notifications",
    "user profile_background_color",
    "user profile_background_image_url",
    "user profile_background_image_url_https",
    "user profile_background_tile",
    "user profile_image_url",
    "user profile_image_url_https",
    "user profile_link_color",
    "user profile_sidebar_border_color",
    "user profile_sidebar_fill_color",
    "user profile_text_color",
    "user profile_use_background_image",
    "user protected",
    "user screen_name",
    "user statuses_count",
    "user time_zone",
    "user translator_type",
    "user url",
    "user utc_offset",
    "user verified",
}
468 | 
469 | 
# Every key allowed in an activity-streams Tweet. key_validation_check raises
# UnexpectedFormatError for any payload key that is not a member of this set.
activity_streams_superset_keys = {
    "actor displayName",
    "actor favoritesCount",
    "actor followersCount",
    "actor friendsCount",
    "actor id",
    "actor image",
    "actor languages",
    "actor link",
    "actor links",
    "actor listedCount",
    "actor objectType",
    "actor postedTime",
    "actor preferredUsername",
    "actor statusesCount",
    "actor summary",
    "actor twitterTimeZone",
    "actor utcOffset",
    "actor verified",
    "body",
    "display_text_range",
    "favoritesCount",
    "generator displayName",
    "generator link",
    "gnip klout_profile klout_user_id",
    "gnip klout_profile link",
    "gnip klout_profile topics",
    "gnip klout_score",
    "gnip matching_rules",
    "gnip urls",
    "id",
    "inReplyTo link",
    "link",
    "location country_code",
    "location displayName",
    "location geo coordinates",
    "location geo type",
    "location link",
    "location name",
    "location objectType",
    "location twitter_country_code",
    "location twitter_place_type",
    "long_object body",
    "long_object display_text_range",
    "long_object twitter_entities hashtags",
    "long_object twitter_entities media",
    "long_object twitter_entities symbols",
    "long_object twitter_entities urls",
    "long_object twitter_entities user_mentions",
    "long_object twitter_extended_entities media",
    "object actor displayName",
    "object actor favoritesCount",
    "object actor followersCount",
    "object actor friendsCount",
    "object actor id",
    "object actor image",
    "object actor languages",
    "object actor link",
    "object actor links",
    "object actor listedCount",
    "object actor location displayName",
    "object actor location objectType",
    "object actor objectType",
    "object actor postedTime",
    "object actor preferredUsername",
    "object actor statusesCount",
    "object actor summary",
    "object actor twitterTimeZone",
    "object actor utcOffset",
    "object actor verified",
    "object body",
    "object display_text_range",
    "object favoritesCount",
    "object generator displayName",
    "object generator link",
    "object id",
    "object inReplyTo link",
    "object link",
    "object location country_code",
    "object location displayName",
    "object location geo coordinates",
    "object location geo type",
    "object location link",
    "object location name",
    "object location objectType",
    "object location twitter_country_code",
    "object location twitter_place_type",
    "object long_object body",
    "object long_object display_text_range",
    "object long_object twitter_entities hashtags",
    "object long_object twitter_entities media",
    "object long_object twitter_entities symbols",
    "object long_object twitter_entities urls",
    "object long_object twitter_entities user_mentions",
    "object long_object twitter_extended_entities media",
    "object object id",
    "object object link",
    "object object objectType",
    "object object postedTime",
    "object object summary",
    "object objectType",
    "object postedTime",
    "object provider displayName",
    "object provider link",
    "object provider objectType",
    "object summary",
    "object twitter_entities hashtags",
    "object twitter_entities symbols",
    "object twitter_entities urls",
    "object twitter_entities user_mentions",
    "object twitter_filter_level",
    "object twitter_lang",
    "object verb",
    "objectType",
    "postedTime",
    "provider displayName",
    "provider link",
    "provider objectType",
    "retweetCount",
    "twitter_entities hashtags",
    "twitter_entities media",
    "twitter_entities symbols",
    "twitter_entities urls",
    "twitter_entities user_mentions",
    "twitter_extended_entities media",
    "twitter_filter_level",
    "twitter_lang",
    "twitter_quoted_status actor displayName",
    "twitter_quoted_status actor favoritesCount",
    "twitter_quoted_status actor followersCount",
    "twitter_quoted_status actor friendsCount",
    "twitter_quoted_status actor id",
    "twitter_quoted_status actor image",
    "twitter_quoted_status actor languages",
    "twitter_quoted_status actor link",
    "twitter_quoted_status actor links",
    "twitter_quoted_status actor listedCount",
    "twitter_quoted_status actor location displayName",
    "twitter_quoted_status actor location objectType",
    "twitter_quoted_status actor objectType",
    "twitter_quoted_status actor postedTime",
    "twitter_quoted_status actor preferredUsername",
    "twitter_quoted_status actor statusesCount",
    "twitter_quoted_status actor summary",
    "twitter_quoted_status actor twitterTimeZone",
    "twitter_quoted_status actor utcOffset",
    "twitter_quoted_status actor verified",
    "twitter_quoted_status body",
    "twitter_quoted_status display_text_range",
    "twitter_quoted_status favoritesCount",
    "twitter_quoted_status generator displayName",
    "twitter_quoted_status generator link",
    "twitter_quoted_status id",
    "twitter_quoted_status inReplyTo link",
    "twitter_quoted_status link",
    "twitter_quoted_status location country_code",
    "twitter_quoted_status location displayName",
    "twitter_quoted_status location geo coordinates",
    "twitter_quoted_status location geo type",
    "twitter_quoted_status location link",
    "twitter_quoted_status location name",
    "twitter_quoted_status location objectType",
    "twitter_quoted_status location twitter_country_code",
    "twitter_quoted_status location twitter_place_type",
    "twitter_quoted_status long_object body",
    "twitter_quoted_status long_object display_text_range",
    "twitter_quoted_status long_object twitter_entities hashtags",
    "twitter_quoted_status long_object twitter_entities media",
    "twitter_quoted_status long_object twitter_entities symbols",
    "twitter_quoted_status long_object twitter_entities urls",
    "twitter_quoted_status long_object twitter_entities user_mentions",
    "twitter_quoted_status long_object twitter_extended_entities media",
    "twitter_quoted_status object id",
    "twitter_quoted_status object link",
    "twitter_quoted_status object objectType",
    "twitter_quoted_status object postedTime",
    "twitter_quoted_status object summary",
    "twitter_quoted_status objectType",
    "twitter_quoted_status postedTime",
    "twitter_quoted_status provider displayName",
    "twitter_quoted_status provider link",
    "twitter_quoted_status provider objectType",
    "twitter_quoted_status twitter_entities hashtags",
    "twitter_quoted_status twitter_entities symbols",
    "twitter_quoted_status twitter_entities urls",
    "twitter_quoted_status twitter_entities user_mentions",
    "twitter_quoted_status twitter_filter_level",
    "twitter_quoted_status twitter_lang",
    "twitter_quoted_status verb",
    "verb",
    # keys added after the alphabetical listing above
    "object twitter_entities media",
    "object twitter_extended_entities media",
    "actor location displayName",
    "gnip profileLocations",
    "actor location objectType",
    "geo coordinates",
    "geo type",
    "twitter_quoted_status twitter_entities media",
    "twitter_quoted_status twitter_extended_entities media",
    "object geo coordinates",
    "object geo type",
}
671 | 
# Keys an activity-streams Tweet must contain: key_validation_check raises
# UnexpectedFormatError when any member of this set is absent from the payload.
activity_streams_minimum_set_keys = {
    # fields nested under "actor"
    "actor displayName",
    "actor favoritesCount",
    "actor followersCount",
    "actor friendsCount",
    "actor id",
    "actor image",
    "actor languages",
    "actor link",
    "actor links",
    "actor listedCount",
    "actor objectType",
    "actor postedTime",
    "actor preferredUsername",
    "actor statusesCount",
    "actor summary",
    "actor twitterTimeZone",
    "actor utcOffset",
    "actor verified",
    # remaining tweet-level and nested fields
    "body",
    "favoritesCount",
    "generator displayName",
    "generator link",
    "gnip matching_rules",
    "id",
    "link",
    "object id",
    "object link",
    "object objectType",
    "object postedTime",
    "objectType",
    "postedTime",
    "provider displayName",
    "provider link",
    "provider objectType",
    "retweetCount",
    "twitter_entities hashtags",
    "twitter_entities symbols",
    "twitter_entities urls",
    "twitter_entities user_mentions",
    "twitter_filter_level",
    "twitter_lang",
    "verb",
}
715 | 
716 | 


--------------------------------------------------------------------------------
/tweet_parser/tweet_parser_errors.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # Copyright 2018 Twitter, Inc.
 3 | # Licensed under the MIT License
 4 | # https://opensource.org/licenses/MIT
 5 | class NotATweetError(Exception):
 6 |     pass
 7 | 
 8 | 
 9 | class NotAvailableError(Exception):
10 |     pass
11 | 
12 | 
class UnexpectedFormatError(Exception):
    """Raised by key validation when a Tweet is missing required keys or
    contains keys outside the known superset for its format."""
15 | 


--------------------------------------------------------------------------------