├── .github
    └── workflows
    │   ├── publish.yml
    │   ├── readme-toc.yml
    │   └── test.yml
├── .gitignore
├── LICENSE
├── README.md
├── email.png
├── setup.py
├── tests
    ├── __init__.py
    ├── test_import.py
    ├── test_migrations.py
    ├── test_save_tweets.py
    ├── tweets.json
    ├── utils.py
    └── zip_contents
    │   ├── account-suspension.js
    │   ├── account.js
    │   ├── app.js
    │   ├── follower.js
    │   ├── following.js
    │   └── saved-search.js
└── twitter_to_sqlite
    ├── __init__.py
    ├── archive.py
    ├── cli.py
    ├── migrations.py
    └── utils.py


/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
 1 | # Tests the package and then uploads it with twine whenever a GitHub
 2 | # release is created.
 3 | name: Publish Python Package
 4 | 
 5 | on:
 6 |   release:
 7 |     types: [created]
 8 | 
 9 | jobs:
10 |   # Install the package with its test extras and run pytest on each
11 |   # Python version in the matrix.
12 |   test:
13 |     runs-on: ubuntu-latest
14 |     strategy:
15 |       matrix:
16 |         # Quoted so every version stays a string, matching the quoted
17 |         # '3.9' in the deploy job; an unquoted 3.10 would load as 3.1.
18 |         python-version: ['3.6', '3.7', '3.8', '3.9']
19 |     steps:
20 |     - uses: actions/checkout@v2
21 |     - name: Set up Python ${{ matrix.python-version }}
22 |       uses: actions/setup-python@v2
23 |       with:
24 |         python-version: ${{ matrix.python-version }}
25 |     - uses: actions/cache@v2
26 |       name: Configure pip caching
27 |       with:
28 |         path: ~/.cache/pip
29 |         # The cache key changes whenever any setup.py changes.
30 |         key: ${{ runner.os }}-pip-${{ hashFiles('**/setup.py') }}
31 |         restore-keys: |
32 |           ${{ runner.os }}-pip-
33 |     - name: Install dependencies
34 |       run: |
35 |         pip install -e '.[test]'
36 |     - name: Run tests
37 |       run: |
38 |         pytest
39 |   # Build the sdist and wheel and upload them; runs only after the test
40 |   # job has succeeded.
41 |   deploy:
42 |     runs-on: ubuntu-latest
43 |     needs: [test]
44 |     steps:
45 |     - uses: actions/checkout@v2
46 |     - name: Set up Python
47 |       uses: actions/setup-python@v2
48 |       with:
49 |         python-version: '3.9'
50 |     - uses: actions/cache@v2
51 |       name: Configure pip caching
52 |       with:
53 |         path: ~/.cache/pip
54 |         key: ${{ runner.os }}-publish-pip-${{ hashFiles('**/setup.py') }}
55 |         restore-keys: |
56 |           ${{ runner.os }}-publish-pip-
57 |     - name: Install dependencies
58 |       run: |
59 |         pip install setuptools wheel twine
60 |     - name: Publish
61 |       env:
62 |         # Token-based twine authentication using the PYPI_TOKEN secret.
63 |         TWINE_USERNAME: __token__
64 |         TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
65 |       run: |
66 |         python setup.py sdist bdist_wheel
67 |         twine upload dist/*
68 | 
69 | 


--------------------------------------------------------------------------------
/.github/workflows/readme-toc.yml:
--------------------------------------------------------------------------------
 1 | # Regenerates the table of contents in README.md and pushes the result
 2 | # whenever README.md changes on main or master.
 3 | name: Update README table of contents
 4 | 
 5 | on:
 6 |   push:
 7 |     branches: [main, master]
 8 |     paths: [README.md]
 9 | 
10 | jobs:
11 |   build:
12 |     runs-on: ubuntu-latest
13 |     steps:
14 |       - name: Check out repo
15 |         uses: actions/checkout@v2
16 |       # -i makes markdown-toc edit README.md in place.
17 |       - name: Update TOC
18 |         run: npx markdown-toc README.md -i
19 |       # Commits only when the working tree differs (git diff --quiet exits
20 |       # non-zero); git push runs either way.
21 |       - name: Commit and push if README changed
22 |         run: |-
23 |           git diff
24 |           git config --global user.email "readme-bot@example.com"
25 |           git config --global user.name "README-bot"
26 |           git diff --quiet || (git add README.md && git commit -m "Updated README")
27 |           git push
28 | 


--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
 1 | # Runs pytest against every Python version in the matrix on each push.
 2 | name: Test
 3 | 
 4 | on: [push]
 5 | 
 6 | jobs:
 7 |   test:
 8 |     runs-on: ubuntu-latest
 9 |     strategy:
10 |       matrix:
11 |         # Quoted so every version stays a string; an unquoted 3.10 would
12 |         # load as the float 3.1.
13 |         python-version: ['3.6', '3.7', '3.8', '3.9']
14 |     steps:
15 |     - uses: actions/checkout@v2
16 |     - name: Set up Python ${{ matrix.python-version }}
17 |       uses: actions/setup-python@v2
18 |       with:
19 |         python-version: ${{ matrix.python-version }}
20 |     - uses: actions/cache@v2
21 |       name: Configure pip caching
22 |       with:
23 |         path: ~/.cache/pip
24 |         # The cache key changes whenever any setup.py changes.
25 |         key: ${{ runner.os }}-pip-${{ hashFiles('**/setup.py') }}
26 |         restore-keys: |
27 |           ${{ runner.os }}-pip-
28 |     # Editable install that pulls in the package's "test" extras.
29 |     - name: Install dependencies
30 |       run: |
31 |         pip install -e '.[test]'
32 |     - name: Run tests
33 |       run: |
34 |         pytest
35 | 
36 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .venv
 2 | .DS_Store
 3 | __pycache__/
 4 | *.py[cod]
 5 | *$py.class
 6 | venv
 7 | .eggs
 8 | .pytest_cache
 9 | *.egg-info
10 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # twitter-to-sqlite
  2 | 
  3 | [![PyPI](https://img.shields.io/pypi/v/twitter-to-sqlite.svg)](https://pypi.org/project/twitter-to-sqlite/)
  4 | [![Changelog](https://img.shields.io/github/v/release/dogsheep/twitter-to-sqlite?include_prereleases&label=changelog)](https://github.com/dogsheep/twitter-to-sqlite/releases)
  5 | [![Tests](https://github.com/dogsheep/twitter-to-sqlite/workflows/Test/badge.svg)](https://github.com/dogsheep/twitter-to-sqlite/actions?query=workflow%3ATest)
  6 | [![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/dogsheep/twitter-to-sqlite/blob/main/LICENSE)
  7 | 
  8 | Save data from Twitter to a SQLite database.
  9 | 
 10 | **This tool currently uses Twitter API v1**. You may be unable to use it if you do not have an API key for that version of the API.
 11 | 
 12 | <!-- toc -->
 13 | 
 14 | - [How to install](#how-to-install)
 15 | - [Authentication](#authentication)
 16 | - [Retrieving tweets by specific accounts](#retrieving-tweets-by-specific-accounts)
 17 | - [Retrieve user profiles in bulk](#retrieve-user-profiles-in-bulk)
 18 | - [Retrieve tweets in bulk](#retrieve-tweets-in-bulk)
 19 | - [Retrieving Twitter followers](#retrieving-twitter-followers)
 20 | - [Retrieving friends](#retrieving-friends)
 21 | - [Retrieving favorited tweets](#retrieving-favorited-tweets)
 22 | - [Retrieving Twitter lists](#retrieving-twitter-lists)
 23 | - [Retrieving Twitter list memberships](#retrieving-twitter-list-memberships)
 24 | - [Retrieving just follower and friend IDs](#retrieving-just-follower-and-friend-ids)
 25 | - [Retrieving tweets from your home timeline](#retrieving-tweets-from-your-home-timeline)
 26 | - [Retrieving your mentions](#retrieving-your-mentions)
 27 | - [Providing input from a SQL query with --sql and --attach](#providing-input-from-a-sql-query-with---sql-and---attach)
 28 | - [Running searches](#running-searches)
 29 | - [Capturing tweets in real-time with track and follow](#capturing-tweets-in-real-time-with-track-and-follow)
 30 |   * [track](#track)
 31 |   * [follow](#follow)
 32 | - [Importing data from your Twitter archive](#importing-data-from-your-twitter-archive)
 33 | - [Design notes](#design-notes)
 34 | 
 35 | <!-- tocstop -->
 36 | 
 37 | ## How to install
 38 | 
 39 |     $ pip install twitter-to-sqlite
 40 | 
 41 | ## Authentication
 42 | 
 43 | First, you will need to create a Twitter application at https://developer.twitter.com/en/apps. You may need to apply for a Twitter developer account - if so, this [example of an email application](https://raw.githubusercontent.com/dogsheep/twitter-to-sqlite/main/email.png) that has been approved in the past may be useful.
 44 | 
 45 | Once you have created your application, navigate to the "Keys and tokens" page and make note of the following:
 46 | 
 47 | * Your API key
 48 | * Your API secret key
 49 | * Your access token
 50 | * Your access token secret
 51 | 
 52 | You will need to save all four of these values to a JSON file in order to use this tool.
 53 | 
 54 | You can create that JSON file by running the following command and pasting in the values at the prompts:
 55 | 
 56 |     $ twitter-to-sqlite auth
 57 |     Create an app here: https://developer.twitter.com/en/apps
 58 |     Then navigate to 'Keys and tokens' and paste in the following:
 59 | 
 60 |     API key: xxx
 61 |     API secret key: xxx
 62 |     Access token: xxx
 63 |     Access token secret: xxx
 64 | 
 65 | This will create a file called `auth.json` in your current directory containing the required values. To save the file at a different path or filename, use the `--auth=myauth.json` option.
 66 | 
 67 | ## Retrieving tweets by specific accounts
 68 | 
 69 | The `user-timeline` command retrieves all of the tweets posted by the specified user accounts. It defaults to the account belonging to the authenticated user:
 70 | 
 71 |     $ twitter-to-sqlite user-timeline twitter.db
 72 |     Importing tweets  [#####-------------------------------]  2799/17780  00:01:39
 73 | 
 74 | All of these commands assume that there is an `auth.json` file in the current directory. You can provide the path to your `auth.json` file using `-a`:
 75 | 
 76 |     $ twitter-to-sqlite user-timeline twitter.db -a /path/to/auth.json
 77 | 
 78 | To load tweets for other users, pass their screen names as arguments:
 79 | 
 80 |     $ twitter-to-sqlite user-timeline twitter.db cleopaws nichemuseums
 81 | 
 82 | Twitter's API only returns up to around 3,200 tweets for most user accounts, but you may find that it returns all available tweets for your own user account.
 83 | 
 84 | You can pass numeric Twitter user IDs instead of screen names using the `--ids` parameter.
 85 | 
 86 | You can use `--since` to retrieve every tweet since the last time you imported for that user, or `--since_id=xxx` to retrieve every tweet since a specific tweet ID.
 87 | 
 88 | This command also accepts `--sql` and `--attach` options, documented below.
 89 | 
 90 | ## Retrieve user profiles in bulk
 91 | 
 92 | If you have a list of Twitter screen names (or user IDs) you can bulk fetch their fully inflated Twitter profiles using the `users-lookup` command:
 93 | 
 94 |     $ twitter-to-sqlite users-lookup users.db simonw cleopaws
 95 | 
 96 | You can pass user IDs instead using the `--ids` option:
 97 | 
 98 |     $ twitter-to-sqlite users-lookup users.db 12497 3166449535 --ids
 99 | 
100 | This command also accepts `--sql` and `--attach` options, documented below.
101 | 
102 | ## Retrieve tweets in bulk
103 | 
104 | If you have a list of tweet IDs you can bulk fetch them using the `statuses-lookup` command:
105 | 
106 |     $ twitter-to-sqlite statuses-lookup tweets.db 1122154819815239680 1122154178493575169
107 | 
108 | The `--sql` and `--attach` options are supported.
109 | 
110 | Here's a recipe to retrieve any tweets that existing tweets are in-reply-to which have not yet been stored in your database:
111 | 
112 |     $ twitter-to-sqlite statuses-lookup tweets.db \
113 |         --sql='
114 |             select in_reply_to_status_id
115 |             from tweets
116 |             where in_reply_to_status_id is not null' \
117 |         --skip-existing
118 | 
119 | The `--skip-existing` option means that tweets that have already been stored in the database will not be fetched again.
120 | 
121 | ## Retrieving Twitter followers
122 | 
123 | The `followers` command retrieves details of every follower of the specified accounts. You can use it to retrieve your own followers, or you can pass one or more screen names to pull the followers for other accounts.
124 | 
125 | The following command pulls your followers and saves them in a SQLite database file called `twitter.db`:
126 | 
127 |     $ twitter-to-sqlite followers twitter.db
128 | 
129 | This command is **extremely slow**, because Twitter imposes a rate limit of no more than one request per minute to this endpoint! If you are running it against an account with thousands of followers you should expect this to take several hours.
130 | 
131 | To retrieve followers for another account, use:
132 | 
133 |     $ twitter-to-sqlite followers twitter.db cleopaws
134 | 
135 | This command also accepts the `--ids`, `--sql` and `--attach` options.
136 | 
137 | See [Analyzing my Twitter followers with Datasette](https://simonwillison.net/2018/Jan/28/analyzing-my-twitter-followers/) for the original inspiration for this command.
138 | 
139 | ## Retrieving friends
140 | 
141 | The `friends` command works like the `followers` command, but retrieves the specified (or currently authenticated) user's friends - defined as accounts that the user is following.
142 | 
143 |     $ twitter-to-sqlite friends twitter.db
144 | 
145 | It takes the same options as the `followers` command.
146 | 
147 | ## Retrieving favorited tweets
148 | 
149 | The `favorites` command retrieves tweets that have been favorited by a specified user. Called without any extra arguments it retrieves tweets favorited by the currently authenticated user:
150 | 
151 |     $ twitter-to-sqlite favorites faves.db
152 | 
153 | You can also use the `--screen_name` or `--user_id` arguments to retrieve favorite tweets for another user:
154 | 
155 |     $ twitter-to-sqlite favorites faves-obama.db --screen_name=BarackObama
156 | 
157 | Use the `--stop_after=xxx` argument to retrieve only the most recent number of favorites, e.g. to get the authenticated user's 50 most recent favorites:
158 | 
159 |     $ twitter-to-sqlite favorites faves.db --stop_after=50
160 | 
161 | ## Retrieving Twitter lists
162 | 
163 | The `lists` command retrieves all of the lists belonging to one or more users.
164 | 
165 |     $ twitter-to-sqlite lists lists.db simonw dogsheep
166 | 
167 | This command also accepts the `--sql`, `--attach` and `--ids` options.
168 | 
169 | To additionally fetch the list of members for each list, use `--members`.
170 | 
171 | ## Retrieving Twitter list memberships
172 | 
173 | The `list-members` command can be used to retrieve details of one or more Twitter lists, including all of their members.
174 | 
175 |     $ twitter-to-sqlite list-members members.db simonw/the-good-place
176 | 
177 | You can pass multiple `screen_name/list_slug` identifiers.
178 | 
179 | If you know the numeric IDs of the lists instead, you can use `--ids`:
180 | 
181 |     $ twitter-to-sqlite list-members members.db 927913322841653248 --ids
182 | 
183 | ## Retrieving just follower and friend IDs
184 | 
185 | It's also possible to retrieve just the numeric Twitter IDs of the accounts that specific users are following ("friends" in Twitter's API terminology) or followed-by:
186 | 
187 |     $ twitter-to-sqlite followers-ids members.db simonw cleopaws
188 | 
189 | This will populate the `following` table with `followed_id`/`follower_id` pairs for the two specified accounts, listing every account ID that is following either of those two accounts.
190 | 
191 |     $ twitter-to-sqlite friends-ids members.db simonw cleopaws
192 | 
193 | This will do the same thing but pull the IDs that those accounts are following.
194 | 
195 | Both of these commands also support `--sql` and `--attach` as an alternative to passing screen names as direct command-line arguments. You can use `--ids` to process the inputs as user IDs rather than screen names.
196 | 
197 | The underlying Twitter APIs have a rate limit of 15 requests every 15 minutes - though they do return up to 5,000 IDs in each call. By default both of these subcommands will wait for 61 seconds between API calls in order to stay within the rate limit - you can adjust this behaviour down to just one second delay if you know you will not be making many calls using `--sleep=1`.
198 | 
199 | ## Retrieving tweets from your home timeline
200 | 
201 | The `home-timeline` command retrieves up to 800 tweets from the home timeline of the authenticated user - generally this means tweets from people you follow.
202 | 
203 |     $ twitter-to-sqlite home-timeline twitter.db
204 |     Importing timeline  [#################--------]  591/800  00:01:14
205 | 
206 | The tweets are stored in the `tweets` table, and a record is added to the `timeline_tweets` table noting that this tweet came in due to being spotted in the timeline of your user.
207 | 
208 | You can use `--since` to retrieve just tweets that have been posted since the last time this command was run, or `--since_id=xxx` to explicitly pass in a tweet ID to use as the last position.
209 | 
210 | You can then view your timeline in Datasette using the following URL:
211 | 
212 | `/tweets/tweets?_where=id+in+(select+tweet+from+[timeline_tweets])&_sort_desc=id&_facet=user`
213 | 
214 | This will filter your tweets table to just tweets that appear in your timeline, ordered by most recent first and use faceting to show you which users are responsible for the most tweets.
215 | 
216 | ## Retrieving your mentions
217 | 
218 | The `mentions-timeline` command works like `home-timeline` except it retrieves tweets that mention the authenticated user's account. It records the user account that was mentioned in a `mentions_tweets` table.
219 | 
220 | It supports `--since` and `--since_id` in the same way as `home-timeline` does.
221 | 
222 | ## Providing input from a SQL query with --sql and --attach
223 | 
224 | This option is available for some subcommands - run `twitter-to-sqlite command-name --help` to check.
225 | 
226 | You can provide Twitter screen names (or user IDs or tweet IDs) directly as command-line arguments, or you can provide those screen names or IDs by executing a SQL query.
227 | 
228 | For example: consider a SQLite database with an `attendees` table listing names and Twitter accounts - something like this:
229 | 
230 | | First   | Last       | Twitter      |
231 | |---------|------------|--------------|
232 | | Simon   | Willison   | simonw       |
233 | | Avril   | Lavigne    | AvrilLavigne |
234 | 
235 | You can run the `users-lookup` command to pull the Twitter profile of every user listed in that database by loading the screen names using a `--sql` query:
236 | 
237 |     $ twitter-to-sqlite users-lookup my.db --sql="select Twitter from attendees"
238 | 
239 | If your database table contains Twitter IDs, you can select those IDs and pass the `--ids` argument. For example, to fetch the profiles of users who have had their user IDs inserted into the `following` table using the `twitter-to-sqlite friends-ids` command:
240 | 
241 |     $ twitter-to-sqlite users-lookup my.db --sql="select follower_id from following" --ids
242 | 
243 | Or to avoid re-fetching users that have already been fetched:
244 | 
245 |     $ twitter-to-sqlite users-lookup my.db \
246 |         --sql="select followed_id from following where followed_id not in (
247 |             select id from users)" --ids
248 | 
249 | If your data lives in a separate database file you can attach it using `--attach`. For example, consider the attendees example above but the data lives in an `attendees.db` file, and you want to fetch the user profiles into a `tweets.db` file. You could do that like this:
250 | 
251 |     $ twitter-to-sqlite users-lookup tweets.db \
252 |         --attach=attendees.db \
253 |         --sql="select Twitter from attendees.attendees"
254 | 
255 | The filename (without the extension) will be used as the database alias within SQLite. If you want a different alias for some reason you can specify that with a colon like this:
256 | 
257 |     $ twitter-to-sqlite users-lookup tweets.db \
258 |         --attach=foo:attendees.db \
259 |         --sql="select Twitter from foo.attendees"
260 | 
261 | ## Running searches
262 | 
263 | The `search` command runs a search against the Twitter [standard search API](https://developer.twitter.com/en/docs/tweets/search/api-reference/get-search-tweets).
264 | 
265 |     $ twitter-to-sqlite search tweets.db "dogsheep"
266 | 
267 | This will import up to around 320 tweets that match that search term into the `tweets` table. It will also create a record in the `search_runs` table recording that the search took place, and many-to-many records in the `search_runs_tweets` table recording which tweets were seen for that search at that time.
268 | 
269 | You can use the `--since` parameter to check for previous search runs with the same arguments and only retrieve tweets that were posted since the last retrieved matching tweet.
270 | 
271 | The following additional options for `search` are supported:
272 | 
273 | * `--geocode`: `latitude,longitude,radius` where radius is a number followed by mi or km
274 | * `--lang`: ISO 639-1 language code e.g. `en` or `es`
275 | * `--locale`: Locale: only `ja` is currently effective
276 | * `--result_type`: `mixed`, `recent` or `popular`. Defaults to `mixed`
277 | * `--count`: Number of results per page, defaults to the maximum of 100
278 | * `--stop_after`: Stop after this many results
279 | * `--since_id`: Pull tweets since this Tweet ID. You probably want to use `--since` instead of this.
280 | 
281 | ## Capturing tweets in real-time with track and follow
282 | 
283 | This functionality is **experimental**. Please [file bug reports](https://github.com/dogsheep/twitter-to-sqlite/issues) if you find any!
284 | 
285 | Twitter provides a real-time API which can be used to subscribe to tweets as they happen. `twitter-to-sqlite` can use this API to continually update a SQLite database with tweets matching certain keywords, or referencing specific users.
286 | 
287 | ### track
288 | 
289 | To track keywords, use the `track` command:
290 | 
291 |     $ twitter-to-sqlite track tweets.db kakapo
292 | 
293 | This command will continue to run until you hit Ctrl+C. It will capture any tweets mentioning the keyword [kakapo](https://en.wikipedia.org/wiki/Kakapo) and store them in the `tweets.db` database file.
294 | 
295 | You can pass multiple keywords as a space separated list. This will capture tweets matching either of those keywords:
296 | 
297 |     $ twitter-to-sqlite track tweets.db kakapo raccoon
298 | 
299 | You can enclose a phrase in quotes to search for tweets matching all of the keywords in that phrase:
300 | 
301 |     $ twitter-to-sqlite track tweets.db 'trash panda'
302 | 
303 | See [the Twitter track documentation](https://developer.twitter.com/en/docs/tweets/filter-realtime/guides/basic-stream-parameters#track) for advanced tips on using this command.
304 | 
305 | Add the `--verbose` option to see matching tweets (in their verbose JSON form) displayed to the terminal as they are captured:
306 | 
307 |     $ twitter-to-sqlite track tweets.db raccoon --verbose
308 | 
309 | ### follow
310 | 
311 | The `follow` command will capture all tweets that are relevant to one or more specific Twitter users.
312 | 
313 |     $ twitter-to-sqlite follow tweets.db nytimes
314 | 
315 | This includes tweets by those users, tweets that reply to or quote those users and retweets by those users. See [the Twitter follow documentation](https://developer.twitter.com/en/docs/tweets/filter-realtime/guides/basic-stream-parameters#follow) for full details.
316 | 
317 | The command accepts one or more screen names.
318 | 
319 | You can feed it numeric Twitter user IDs instead of screen names by using the `--ids` flag.
320 | 
321 | The command also supports the `--sql` and `--attach` options, and the `--verbose` option for displaying tweets as they are captured.
322 | 
323 | Here's how to start following tweets from every user ID currently represented as being followed in the `following` table (populated using the `friends-ids` command):
324 | 
325 |     $ twitter-to-sqlite follow tweets.db \
326 |         --sql="select distinct followed_id from following" \
327 |         --ids
328 | 
329 | ## Importing data from your Twitter archive
330 | 
331 | You can request an archive of your Twitter data by [following these instructions](https://help.twitter.com/en/managing-your-account/how-to-download-your-twitter-archive).
332 | 
333 | Twitter will send you a link to download a `.zip` file. You can import the contents of that file into a set of tables in a new database file called `archive.db` (each table beginning with the `archive_` prefix) using the `import` command:
334 | 
335 |     $ twitter-to-sqlite import archive.db ~/Downloads/twitter-2019-06-25-b31f2.zip
336 | 
337 | This command does not populate any of the regular tables, since Twitter's export data does not exactly match the schema returned by the Twitter API.
338 | 
339 | It will delete and recreate the corresponding `archive_*` tables every time you run it. If this is not what you want, run the command against a new SQLite database file name rather than running it against one that already exists.
340 | 
341 | If you have already decompressed your archive, you can run this against the directory that you decompressed it to:
342 | 
343 |     $ twitter-to-sqlite import archive.db ~/Downloads/twitter-2019-06-25-b31f2/
344 | 
345 | You can also run it against one or more specific files within that folder. For example, to import just the follower.js and following.js files:
346 | 
347 |     $ twitter-to-sqlite import archive.db \
348 |         ~/Downloads/twitter-2019-06-25-b31f2/follower.js \
349 |         ~/Downloads/twitter-2019-06-25-b31f2/following.js
350 | 
351 | You may want to use other commands to populate tables based on data from the archive. For example, to retrieve full API versions of each of the tweets you have favourited in your archive, you could run the following:
352 | 
353 |     $ twitter-to-sqlite statuses-lookup archive.db \
354 |         --sql='select tweetId from archive_like' \
355 |         --skip-existing
356 | 
357 | If you want these imported tweets to then be reflected in the `favorited_by` table, you can do so by applying the following SQL query:
358 | 
359 |     $ sqlite3 archive.db
360 |     SQLite version 3.22.0 2018-01-22 18:45:57
361 |     Enter ".help" for usage hints.
362 |     sqlite> INSERT OR IGNORE INTO favorited_by (tweet, user)
363 |        ...>     SELECT tweetId, 'YOUR_TWITTER_ID' FROM archive_like;
364 |     <Ctrl+D>
365 | 
366 | Replace YOUR_TWITTER_ID with your numeric Twitter ID. If you don't know that ID you can find it out by running the following:
367 | 
368 |     $ twitter-to-sqlite fetch \
369 |         "https://api.twitter.com/1.1/account/verify_credentials.json" \
370 |         | grep '"id"' | head -n 1
371 | 
372 | ## Design notes
373 | 
374 | * Tweet IDs are stored as integers, to afford sorting by ID in a sensible way
375 | * While we configure foreign key relationships between tables, we do not ask SQLite to enforce them. This is used by the `following` table to allow the `followers-ids` and `friends-ids` commands to populate it with user IDs even if the user accounts themselves are not yet present in the `users` table.
376 | 


--------------------------------------------------------------------------------
/email.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dogsheep/twitter-to-sqlite/f09d611782a8372cfb002792dfa727325afb4db6/email.png


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | import os
 3 | 
# Release version string; passed to setup() as the package version.
VERSION = "0.22"
 5 | 
 6 | 
 7 | def get_long_description():
 8 |     with open(
 9 |         os.path.join(os.path.dirname(os.path.abspath(__file__)), "README.md"),
10 |         encoding="utf8",
11 |     ) as fp:
12 |         return fp.read()
13 | 
14 | 
# Package definition. Keyword arguments are grouped as: identity, description,
# authorship/links, contents, then dependencies.
setup(
    # Identity
    name="twitter-to-sqlite",
    version=VERSION,
    # Description: the README text, declared as Markdown.
    description="Save data from Twitter to a SQLite database",
    long_description=get_long_description(),
    long_description_content_type="text/markdown",
    # Authorship, license and links
    author="Simon Willison",
    license="Apache License, Version 2.0",
    url="https://datasette.io/tools/twitter-to-sqlite",
    project_urls={
        "Issues": "https://github.com/dogsheep/twitter-to-sqlite/issues",
        "CI": "https://github.com/dogsheep/twitter-to-sqlite/actions",
        "Changelog": "https://github.com/dogsheep/twitter-to-sqlite/releases",
    },
    # Contents: one package plus the `twitter-to-sqlite` console script.
    packages=["twitter_to_sqlite"],
    entry_points="""
        [console_scripts]
        twitter-to-sqlite=twitter_to_sqlite.cli:cli
    """,
    # Dependencies; the "test" extra pulls in pytest.
    install_requires=[
        "sqlite-utils>=2.4.2",
        "requests-oauthlib~=1.2.0",
        "python-dateutil",
    ],
    extras_require={"test": ["pytest"]},
    tests_require=["twitter-to-sqlite[test]"],
)
42 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dogsheep/twitter-to-sqlite/f09d611782a8372cfb002792dfa727325afb4db6/tests/__init__.py


--------------------------------------------------------------------------------
/tests/test_import.py:
--------------------------------------------------------------------------------
  1 | import io
  2 | import pathlib
  3 | 
  4 | import pytest
  5 | import sqlite_utils
  6 | from click.testing import CliRunner
  7 | from twitter_to_sqlite import cli
  8 | 
  9 | from .utils import create_zip
 10 | 
 11 | 
 12 | @pytest.fixture
 13 | def zip_contents_path():
 14 |     return pathlib.Path(__file__).parent / "zip_contents"
 15 | 
 16 | 
 17 | @pytest.fixture
 18 | def import_test_zip(tmpdir, zip_contents_path):
 19 |     archive = str(tmpdir / "archive.zip")
 20 |     buf = io.BytesIO()
 21 |     zf = create_zip(zip_contents_path, buf)
 22 |     zf.close()
 23 |     open(archive, "wb").write(buf.getbuffer())
 24 |     return tmpdir, archive
 25 | 
 26 | 
 27 | def test_create_zip(zip_contents_path):
 28 |     zf = create_zip(zip_contents_path)
 29 |     assert {
 30 |         "account-suspension.js",
 31 |         "account.js",
 32 |         "app.js",
 33 |         "saved-search.js",
 34 |         "following.js",
 35 |         "follower.js",
 36 |     } == {f.filename for f in zf.filelist}
 37 | 
 38 | 
 39 | def test_cli_import_zip_file(import_test_zip):
 40 |     tmpdir, archive = import_test_zip
 41 |     output = str(tmpdir / "output.db")
 42 |     result = CliRunner().invoke(cli.cli, ["import", output, archive])
 43 |     assert 0 == result.exit_code, result.stdout
 44 |     db = sqlite_utils.Database(output)
 45 |     assert_imported_db(db)
 46 | 
 47 | 
 48 | def test_cli_import_folder(tmpdir, zip_contents_path):
 49 |     output = str(tmpdir / "output.db")
 50 |     result = CliRunner().invoke(cli.cli, ["import", output, str(zip_contents_path)])
 51 |     assert 0 == result.exit_code, result.stdout
 52 |     db = sqlite_utils.Database(output)
 53 |     assert_imported_db(db)
 54 | 
 55 | 
 56 | def test_cli_import_specific_files(tmpdir, zip_contents_path):
 57 |     output = str(tmpdir / "output.db")
 58 |     result = CliRunner().invoke(
 59 |         cli.cli,
 60 |         [
 61 |             "import",
 62 |             output,
 63 |             str(zip_contents_path / "follower.js"),
 64 |             str(zip_contents_path / "following.js"),
 65 |         ],
 66 |     )
 67 |     assert 0 == result.exit_code, result.stdout
 68 |     db = sqlite_utils.Database(output)
 69 |     # Should just have two tables
 70 |     assert ["archive_follower", "archive_following"] == db.table_names()
 71 | 
 72 | 
 73 | def assert_imported_db(db):
 74 |     assert {
 75 |         "archive_follower",
 76 |         "archive_saved_search",
 77 |         "archive_account",
 78 |         "archive_app",
 79 |         "archive_following",
 80 |     } == set(db.table_names())
 81 | 
 82 |     assert [{"accountId": "73747798"}, {"accountId": "386025404"}] == list(
 83 |         db["archive_follower"].rows
 84 |     )
 85 |     assert [{"accountId": "547842573"}, {"accountId": "12158"}] == list(
 86 |         db["archive_following"].rows
 87 |     )
 88 | 
 89 |     assert [{"appId": "1380676511", "appNames": '["BBC Sounds"]'}] == list(
 90 |         db["archive_app"].rows
 91 |     )
 92 | 
 93 |     assert [
 94 |         {"savedSearchId": "42214", "query": "simonw"},
 95 |         {"savedSearchId": "55814", "query": "django"},
 96 |     ] == list(db["archive_saved_search"].rows)
 97 |     assert [
 98 |         {
 99 |             "pk": "c4e32e91742df2331ef3ad1e481d1a64d781183a",
100 |             "phoneNumber": "+15555555555",
101 |             "email": "swillison@example.com",
102 |             "createdVia": "web",
103 |             "username": "simonw",
104 |             "accountId": "12497",
105 |             "createdAt": "2006-11-15T13:18:50.000Z",
106 |             "accountDisplayName": "Simon Willison",
107 |         }
108 |     ] == list(db["archive_account"].rows)
109 | 
110 | 
def test_deletes_existing_archive_tables(import_test_zip):
    """``import`` replaces a pre-existing ``archive_follower`` table.

    A table with a different schema is created first; after the import the
    table must have the imported schema and two rows.
    """
    tmpdir, archive = import_test_zip
    output = str(tmpdir / "output.db")
    db = sqlite_utils.Database(output)
    # Create a table
    db["archive_follower"].create({"id": int})
    db["archive_follower"].insert({"id": 1})
    assert ["archive_follower"] == db.table_names()
    assert [{"id": 1}] == list(db["archive_follower"].rows)
    assert (
        "CREATE TABLE [archive_follower] (\n   [id] INTEGER\n)"
        == db["archive_follower"].schema
    )
    # Running the import should wipe and recreate that table
    result = CliRunner().invoke(cli.cli, ["import", output, archive])
    # Check the exit code, as the other CLI tests do, so that a failing
    # command shows its output rather than a confusing schema mismatch.
    assert 0 == result.exit_code, result.stdout
    # That table should have been deleted and recreated
    assert (
        "CREATE TABLE [archive_follower] (\n   [accountId] TEXT PRIMARY KEY\n)"
        == db["archive_follower"].schema
    )
    assert 2 == db["archive_follower"].count
132 | 


--------------------------------------------------------------------------------
/tests/test_migrations.py:
--------------------------------------------------------------------------------
 1 | import sqlite_utils
 2 | from click.testing import CliRunner
 3 | import sqlite_utils
 4 | from twitter_to_sqlite import cli, migrations
 5 | 
 6 | from .test_import import zip_contents_path
 7 | from .test_save_tweets import db, tweets
 8 | 
 9 | 
def test_no_migrations_on_first_run(tmpdir, zip_contents_path):
    """No ``migrations`` table after the first run; one appears after a re-run."""
    output = str(tmpdir / "output.db")
    args = ["import", output, str(zip_contents_path / "follower.js")]
    result = CliRunner().invoke(cli.cli, args)
    assert 0 == result.exit_code, result.stdout
    db = sqlite_utils.Database(output)
    assert ["archive_follower"] == db.table_names()
    # Re-running the command should also run the migrations
    result = CliRunner().invoke(cli.cli, args)
    # The second invocation must succeed too; without this check a crash
    # would only show up indirectly through the table-name assertion.
    assert 0 == result.exit_code, result.stdout
    db = sqlite_utils.Database(output)
    assert {"archive_follower", "migrations"} == set(db.table_names())
21 | 
22 | 
def test_convert_source_column():
    """HTML ``source`` values are moved into ``sources`` and replaced by IDs.

    The third row already stores an ID and is expected to be left unchanged.
    """
    database = sqlite_utils.Database(memory=True)
    seed_rows = [
        {"id": 1, "source": '<a href="URL">NAME</a>'},
        {"id": 2, "source": '<a href="URL2">NAME2</a>'},
        {"id": 3, "source": "d3c1d39c57fecfc09202f20ea5e2db30262029fd"},
    ]
    database["tweets"].insert_all(seed_rows, pk="id")

    migrations.convert_source_column(database)

    expected_sources = [
        {
            "id": "d3c1d39c57fecfc09202f20ea5e2db30262029fd",
            "url": "URL",
            "name": "NAME",
        },
        {
            "id": "000e4c4db71278018fb8c322f070d051e76885b1",
            "url": "URL2",
            "name": "NAME2",
        },
    ]
    assert expected_sources == list(database["sources"].rows)

    expected_tweets = [
        {"id": 1, "source": "d3c1d39c57fecfc09202f20ea5e2db30262029fd"},
        {"id": 2, "source": "000e4c4db71278018fb8c322f070d051e76885b1"},
        {"id": 3, "source": "d3c1d39c57fecfc09202f20ea5e2db30262029fd"},
    ]
    assert expected_tweets == list(database["tweets"].rows)
51 | 
52 | 
def test_convert_source_column_against_real_database(db):
    """Smoke test: the migration runs without error against the ``db`` fixture."""
    existing_tables = db.table_names()
    assert "migrations" not in existing_tables
    migrations.convert_source_column(db)
56 | 


--------------------------------------------------------------------------------
/tests/test_save_tweets.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import pathlib
  3 | 
  4 | import pytest
  5 | import sqlite_utils
  6 | from twitter_to_sqlite import utils
  7 | 
  8 | 
  9 | @pytest.fixture
 10 | def tweets():
 11 |     return json.load(open(pathlib.Path(__file__).parent / "tweets.json"))
 12 | 
 13 | 
 14 | @pytest.fixture
 15 | def db(tweets):
 16 |     db = sqlite_utils.Database(memory=True)
 17 |     utils.save_tweets(db, tweets)
 18 |     return db
 19 | 
 20 | 
 21 | def test_tables(db):
 22 |     assert {
 23 |         "sources",
 24 |         "users_fts_idx",
 25 |         "users_fts_data",
 26 |         "tweets_fts",
 27 |         "tweets_fts_idx",
 28 |         "tweets",
 29 |         "users",
 30 |         "places",
 31 |         "following",
 32 |         "tweets_fts_data",
 33 |         "users_fts_config",
 34 |         "users_fts",
 35 |         "tweets_fts_config",
 36 |         "tweets_fts_docsize",
 37 |         "users_fts_docsize",
 38 |         "media",
 39 |         "media_tweets",
 40 |         "since_id_types",
 41 |         "since_ids",
 42 |         "count_history_types",
 43 |         "count_history",
 44 |     } == set(db.table_names())
 45 |     # And check for indexes
 46 |     following_indexes = {tuple(i.columns) for i in db["following"].indexes}
 47 |     assert {
 48 |         ("followed_id", "follower_id"),
 49 |         ("followed_id",),
 50 |         ("follower_id",),
 51 |     } == following_indexes
 52 | 
 53 | 
 54 | def test_users(db):
 55 |     user_rows = list(db["users"].rows)
 56 |     assert [
 57 |         {
 58 |             "id": 12497,
 59 |             "name": "Simon Willison",
 60 |             "screen_name": "simonw",
 61 |             "location": "San Francisco, CA",
 62 |             "description": "Creator of Datasette, co-creator Django. Fellow at @JSKstanford. Usually hanging out with @natbat and @cleopaws. He/Him",
 63 |             "url": "https://simonwillison.net/",
 64 |             "protected": 0,
 65 |             "followers_count": 17754,
 66 |             "friends_count": 3460,
 67 |             "listed_count": 1230,
 68 |             "created_at": "2006-11-15T13:18:50+00:00",
 69 |             "favourites_count": 21506,
 70 |             "utc_offset": None,
 71 |             "time_zone": None,
 72 |             "geo_enabled": 1,
 73 |             "verified": 1,
 74 |             "statuses_count": 17780,
 75 |             "lang": None,
 76 |             "contributors_enabled": 0,
 77 |             "is_translator": 0,
 78 |             "is_translation_enabled": 0,
 79 |             "profile_background_color": "000000",
 80 |             "profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png",
 81 |             "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png",
 82 |             "profile_background_tile": 0,
 83 |             "profile_image_url": "http://pbs.twimg.com/profile_images/378800000261649705/be9cc55e64014e6d7663c50d7cb9fc75_normal.jpeg",
 84 |             "profile_image_url_https": "https://pbs.twimg.com/profile_images/378800000261649705/be9cc55e64014e6d7663c50d7cb9fc75_normal.jpeg",
 85 |             "profile_banner_url": "https://pbs.twimg.com/profile_banners/12497/1347977147",
 86 |             "profile_link_color": "0000FF",
 87 |             "profile_sidebar_border_color": "FFFFFF",
 88 |             "profile_sidebar_fill_color": "FFFFFF",
 89 |             "profile_text_color": "000000",
 90 |             "profile_use_background_image": 1,
 91 |             "has_extended_profile": 1,
 92 |             "default_profile": 0,
 93 |             "default_profile_image": 0,
 94 |             "following": 0,
 95 |             "follow_request_sent": 0,
 96 |             "notifications": 0,
 97 |             "translator_type": "regular",
 98 |         },
 99 |         {
100 |             "id": 14148390,
101 |             "name": "Brian Whitman",
102 |             "screen_name": "bwhitman",
103 |             "location": "Fort Greene NYC",
104 |             "description": "finding the good @ourcanopy with the best people. was CTO/cofounder of Echo Nest, then research @ Spotify. always music",
105 |             "url": "https://notes.variogr.am/about/",
106 |             "protected": 0,
107 |             "followers_count": 4300,
108 |             "friends_count": 639,
109 |             "listed_count": 235,
110 |             "created_at": "2008-03-14T18:19:20+00:00",
111 |             "favourites_count": 8966,
112 |             "utc_offset": None,
113 |             "time_zone": None,
114 |             "geo_enabled": 1,
115 |             "verified": 0,
116 |             "statuses_count": 2192,
117 |             "lang": None,
118 |             "contributors_enabled": 0,
119 |             "is_translator": 0,
120 |             "is_translation_enabled": 0,
121 |             "profile_background_color": "FFFFFF",
122 |             "profile_background_image_url": "http://abs.twimg.com/images/themes/theme13/bg.gif",
123 |             "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme13/bg.gif",
124 |             "profile_background_tile": 0,
125 |             "profile_image_url": "http://pbs.twimg.com/profile_images/742302060/avatars-000000620200-z21ozh-crop_normal.jpeg",
126 |             "profile_image_url_https": "https://pbs.twimg.com/profile_images/742302060/avatars-000000620200-z21ozh-crop_normal.jpeg",
127 |             "profile_banner_url": "https://pbs.twimg.com/profile_banners/14148390/1398269147",
128 |             "profile_link_color": "911A1A",
129 |             "profile_sidebar_border_color": "EEEEEE",
130 |             "profile_sidebar_fill_color": "FFFFFF",
131 |             "profile_text_color": "333333",
132 |             "profile_use_background_image": 0,
133 |             "has_extended_profile": 1,
134 |             "default_profile": 0,
135 |             "default_profile_image": 0,
136 |             "following": 0,
137 |             "follow_request_sent": 0,
138 |             "notifications": 0,
139 |             "translator_type": "none",
140 |         },
141 |         {
142 |             "id": 22737278,
143 |             "name": "David Roberts",
144 |             "screen_name": "drvox",
145 |             "location": "Seattle, WA",
146 |             "description": "Seattleite transplanted from Tennessee; now blogging for http://Vox.com about energy politics. Climate hawk, deficit dove. Not a doctor.",
147 |             "url": "http://www.vox.com/authors/david-roberts",
148 |             "protected": 0,
149 |             "followers_count": 132789,
150 |             "friends_count": 2723,
151 |             "listed_count": 4644,
152 |             "created_at": "2009-03-04T05:14:12+00:00",
153 |             "favourites_count": 26,
154 |             "utc_offset": None,
155 |             "time_zone": None,
156 |             "geo_enabled": 0,
157 |             "verified": 1,
158 |             "statuses_count": 13887,
159 |             "lang": None,
160 |             "contributors_enabled": 0,
161 |             "is_translator": 0,
162 |             "is_translation_enabled": 0,
163 |             "profile_background_color": "022330",
164 |             "profile_background_image_url": "http://abs.twimg.com/images/themes/theme15/bg.png",
165 |             "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme15/bg.png",
166 |             "profile_background_tile": 0,
167 |             "profile_image_url": "http://pbs.twimg.com/profile_images/551076081051004929/2i4QEfn-_normal.jpeg",
168 |             "profile_image_url_https": "https://pbs.twimg.com/profile_images/551076081051004929/2i4QEfn-_normal.jpeg",
169 |             "profile_banner_url": "https://pbs.twimg.com/profile_banners/22737278/1433745271",
170 |             "profile_link_color": "0084B4",
171 |             "profile_sidebar_border_color": "A8C7F7",
172 |             "profile_sidebar_fill_color": "C0DFEC",
173 |             "profile_text_color": "333333",
174 |             "profile_use_background_image": 1,
175 |             "has_extended_profile": 0,
176 |             "default_profile": 0,
177 |             "default_profile_image": 0,
178 |             "following": 1,
179 |             "follow_request_sent": 0,
180 |             "notifications": 0,
181 |             "translator_type": "none",
182 |         },
183 |     ] == user_rows
184 | 
185 | 
def test_tweets(db):
    """The ``tweets`` table holds exactly the five expected tweet rows, in order."""
    expected_tweets = [
        {
            "id": 861696799362478100,
            "user": 14148390,
            "created_at": "2017-05-08T21:38:21+00:00",
            "full_text": "If you use Photos (mac) & Live Photos, run this command to generate a lovely sound collage of where you’ve been https://gist.github.com/bwhitman/5be2f905556a25145dbac74fe4080739",
            "retweeted_status": None,
            "quoted_status": None,
            "place": None,
            "source": "e6528b505bcfd811fdd40ff2d46665dbccba2024",
            "truncated": 0,
            "display_text_range": "[0, 139]",
            "in_reply_to_status_id": None,
            "in_reply_to_user_id": None,
            "in_reply_to_screen_name": None,
            "geo": None,
            "coordinates": None,
            "contributors": None,
            "is_quote_status": 0,
            "retweet_count": 14,
            "favorite_count": 57,
            "favorited": 0,
            "retweeted": 0,
            "possibly_sensitive": 0,
            "lang": "en",
        },
        {
            "id": 1168529001599533000,
            "user": 12497,
            "created_at": "2019-09-02T14:19:58+00:00",
            "full_text": "Finally got around to running this script. It is BRILLIANT - it produces a concatenated .wav file of the audio from every live photo you've ever taken.\n\nNeeds quite a lot of disk space to run - the /tmp/picblast folder can take multiple GB https://twitter.com/bwhitman/status/861696799362478085",
            "retweeted_status": None,
            "quoted_status": 861696799362478100,
            "place": None,
            "source": "1f89d6a41b1505a3071169f8d0d028ba9ad6f952",
            "truncated": 0,
            "display_text_range": "[0, 239]",
            "in_reply_to_status_id": None,
            "in_reply_to_user_id": None,
            "in_reply_to_screen_name": None,
            "geo": None,
            "coordinates": None,
            "contributors": None,
            "is_quote_status": 1,
            "retweet_count": 4,
            "favorite_count": 31,
            "favorited": 0,
            "retweeted": 0,
            "possibly_sensitive": 0,
            "lang": "en",
        },
        {
            "id": 1169196446043664400,
            "user": 12497,
            "created_at": "2019-09-04T10:32:10+00:00",
            "full_text": "@scientiffic @Wikipedia @unsplash @cagarrity The @inaturalist API is amazingly powerful and fun with no auth and no rate limit. We used it to build http://www.owlsnearme.com - see also @Natbat's great tutorial on using it with @observablehq https://24ways.org/2018/observable-notebooks-and-inaturalist/",
            "retweeted_status": None,
            "quoted_status": None,
            "place": "01a9a39529b27f36",
            "source": None,
            "truncated": 0,
            "display_text_range": "[45, 262]",
            "in_reply_to_status_id": "1169079390577320000",
            "in_reply_to_user_id": "82016165",
            "in_reply_to_screen_name": "scientiffic",
            "geo": None,
            "coordinates": None,
            "contributors": None,
            "is_quote_status": 0,
            "retweet_count": 0,
            "favorite_count": 2,
            "favorited": 0,
            "retweeted": 0,
            "possibly_sensitive": 0,
            "lang": "en",
        },
        {
            "id": 1169242008432644000,
            "user": 22737278,
            "created_at": "2019-09-04T13:33:12+00:00",
            "full_text": "My new post: an explainer on “carbon capture & utilization” (CCU). CO2 captured from waste gases or the ambient air can be used to make valuable products. Could CCU help the carbon capture industry scale up? https://www.vox.com/energy-and-environment/2019/9/4/20829431/climate-change-carbon-capture-utilization-sequestration-ccu-ccs?utm_campaign=drvox&utm_content=chorus&utm_medium=social&utm_source=twitter",
            "retweeted_status": None,
            "quoted_status": None,
            "place": None,
            "source": "942cfc2bf9f290ddbe3d78f1907dc084a00ed23f",
            "truncated": 0,
            "display_text_range": "[0, 235]",
            "in_reply_to_status_id": None,
            "in_reply_to_user_id": None,
            "in_reply_to_screen_name": None,
            "geo": None,
            "coordinates": None,
            "contributors": None,
            "is_quote_status": 0,
            "retweet_count": 42,
            "favorite_count": 86,
            "favorited": 1,
            "retweeted": 1,
            "possibly_sensitive": 0,
            "lang": "en",
        },
        {
            "id": 1169246717864136700,
            "user": 12497,
            "created_at": "2019-09-04T13:51:55+00:00",
            "full_text": "RT @drvox: My new post: an explainer on “carbon capture & utilization” (CCU). CO2 captured from waste gases or the ambient air can be used…",
            "retweeted_status": 1169242008432644000,
            "quoted_status": None,
            "place": None,
            "source": "95f3aaaddaa45937ac94765e0ddb68ba2be92d20",
            "truncated": 0,
            "display_text_range": "[0, 143]",
            "in_reply_to_status_id": None,
            "in_reply_to_user_id": None,
            "in_reply_to_screen_name": None,
            "geo": None,
            "coordinates": None,
            "contributors": None,
            "is_quote_status": 0,
            "retweet_count": 42,
            "favorite_count": 0,
            "favorited": 1,
            "retweeted": 1,
            "possibly_sensitive": None,
            "lang": "en",
        },
    ]
    assert expected_tweets == list(db["tweets"].rows)
315 | 
316 | 
317 | def test_sources(db):
318 |     source_rows = list(db["sources"].rows)
319 |     assert [
320 |         {
321 |             "id": "95f3aaaddaa45937ac94765e0ddb68ba2be92d20",
322 |             "name": "Twitter for iPhone",
323 |             "url": "http://twitter.com/download/iphone",
324 |         },
325 |         {
326 |             "id": "942cfc2bf9f290ddbe3d78f1907dc084a00ed23f",
327 |             "name": "Vox Media",
328 |             "url": "http://www.voxmedia.com",
329 |         },
330 |         {
331 |             "id": "1f89d6a41b1505a3071169f8d0d028ba9ad6f952",
332 |             "name": "Twitter Web App",
333 |             "url": "https://mobile.twitter.com",
334 |         },
335 |         {
336 |             "id": "e6528b505bcfd811fdd40ff2d46665dbccba2024",
337 |             "name": "Twitter for Mac",
338 |             "url": "http://itunes.apple.com/us/app/twitter/id409789998?mt=12",
339 |         },
340 |     ] == source_rows
341 | 
342 | 
343 | def test_places(db):
344 |     place_rows = list(db["places"].rows)
345 |     assert [
346 |         {
347 |             "id": "01a9a39529b27f36",
348 |             "url": "https://api.twitter.com/1.1/geo/id/01a9a39529b27f36.json",
349 |             "place_type": "city",
350 |             "name": "Manhattan",
351 |             "full_name": "Manhattan, NY",
352 |             "country_code": "US",
353 |             "country": "United States",
354 |             "contained_within": "[]",
355 |             "bounding_box": '{"type": "Polygon", "coordinates": [[[-74.026675, 40.683935], [-73.910408, 40.683935], [-73.910408, 40.877483], [-74.026675, 40.877483]]]}',
356 |             "attributes": "{}",
357 |         }
358 |     ] == place_rows
359 | 
360 | 
361 | def test_media(db):
362 |     media_rows = list(db["media"].rows)
363 |     media_tweets_rows = list(db["media_tweets"].rows)
364 |     assert [
365 |         {
366 |             "id": 504727051174031360,
367 |             "id_str": "504727051174031361",
368 |             "indices": "[116, 138]",
369 |             "media_url": "http://pbs.twimg.com/media/BwEmlDHCMAEGwu_.jpg",
370 |             "media_url_https": "https://pbs.twimg.com/media/BwEmlDHCMAEGwu_.jpg",
371 |             "url": "http://t.co/uavPaDsvNe",
372 |             "display_url": "pic.twitter.com/uavPaDsvNe",
373 |             "expanded_url": "https://twitter.com/UpturnedBathtub/status/504727120812453889/photo/1",
374 |             "type": "photo",
375 |             "sizes": '{"thumb": {"w": 150, "h": 150, "resize": "crop"}, "large": {"w": 1024, "h": 768, "resize": "fit"}, "medium": {"w": 1024, "h": 768, "resize": "fit"}, "small": {"w": 680, "h": 510, "resize": "fit"}}',
376 |         }
377 |     ] == media_rows
378 |     assert [
379 |         {"media_id": 504727051174031360, "tweets_id": 1169196446043664400}
380 |     ] == media_tweets_rows
381 | 


--------------------------------------------------------------------------------
/tests/tweets.json:
--------------------------------------------------------------------------------
  1 | [
  2 |     {
  3 |         "created_at": "Wed Sep 04 13:51:55 +0000 2019",
  4 |         "id": 1169246717864136700,
  5 |         "id_str": "1169246717864136705",
  6 |         "full_text": "RT @drvox: My new post: an explainer on “carbon capture &amp; utilization” (CCU). CO2 captured from waste gases or the ambient air can be used…",
  7 |         "truncated": false,
  8 |         "display_text_range": [
  9 |             0,
 10 |             143
 11 |         ],
 12 |         "entities": {
 13 |             "hashtags": [],
 14 |             "symbols": [],
 15 |             "user_mentions": [
 16 |                 {
 17 |                     "screen_name": "drvox",
 18 |                     "name": "David Roberts",
 19 |                     "id": 22737278,
 20 |                     "id_str": "22737278",
 21 |                     "indices": [
 22 |                         3,
 23 |                         9
 24 |                     ]
 25 |                 }
 26 |             ],
 27 |             "urls": []
 28 |         },
 29 |         "source": "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>",
 30 |         "in_reply_to_status_id": null,
 31 |         "in_reply_to_status_id_str": null,
 32 |         "in_reply_to_user_id": null,
 33 |         "in_reply_to_user_id_str": null,
 34 |         "in_reply_to_screen_name": null,
 35 |         "user": {
 36 |             "id": 12497,
 37 |             "id_str": "12497",
 38 |             "name": "Simon Willison",
 39 |             "screen_name": "simonw",
 40 |             "location": "San Francisco, CA",
 41 |             "description": "Creator of Datasette, co-creator Django. Fellow at @JSKstanford. Usually hanging out with @natbat and @cleopaws. He/Him",
 42 |             "url": "https://t.co/wyNggeHZ8W",
 43 |             "entities": {
 44 |                 "url": {
 45 |                     "urls": [
 46 |                         {
 47 |                             "url": "https://t.co/wyNggeHZ8W",
 48 |                             "expanded_url": "https://simonwillison.net/",
 49 |                             "display_url": "simonwillison.net",
 50 |                             "indices": [
 51 |                                 0,
 52 |                                 23
 53 |                             ]
 54 |                         }
 55 |                     ]
 56 |                 },
 57 |                 "description": {
 58 |                     "urls": []
 59 |                 }
 60 |             },
 61 |             "protected": false,
 62 |             "followers_count": 17754,
 63 |             "friends_count": 3460,
 64 |             "listed_count": 1230,
 65 |             "created_at": "Wed Nov 15 13:18:50 +0000 2006",
 66 |             "favourites_count": 21506,
 67 |             "utc_offset": null,
 68 |             "time_zone": null,
 69 |             "geo_enabled": true,
 70 |             "verified": true,
 71 |             "statuses_count": 17780,
 72 |             "lang": null,
 73 |             "contributors_enabled": false,
 74 |             "is_translator": false,
 75 |             "is_translation_enabled": false,
 76 |             "profile_background_color": "000000",
 77 |             "profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png",
 78 |             "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png",
 79 |             "profile_background_tile": false,
 80 |             "profile_image_url": "http://pbs.twimg.com/profile_images/378800000261649705/be9cc55e64014e6d7663c50d7cb9fc75_normal.jpeg",
 81 |             "profile_image_url_https": "https://pbs.twimg.com/profile_images/378800000261649705/be9cc55e64014e6d7663c50d7cb9fc75_normal.jpeg",
 82 |             "profile_banner_url": "https://pbs.twimg.com/profile_banners/12497/1347977147",
 83 |             "profile_link_color": "0000FF",
 84 |             "profile_sidebar_border_color": "FFFFFF",
 85 |             "profile_sidebar_fill_color": "FFFFFF",
 86 |             "profile_text_color": "000000",
 87 |             "profile_use_background_image": true,
 88 |             "has_extended_profile": true,
 89 |             "default_profile": false,
 90 |             "default_profile_image": false,
 91 |             "following": false,
 92 |             "follow_request_sent": false,
 93 |             "notifications": false,
 94 |             "translator_type": "regular"
 95 |         },
 96 |         "geo": null,
 97 |         "coordinates": null,
 98 |         "place": null,
 99 |         "contributors": null,
100 |         "retweeted_status": {
101 |             "created_at": "Wed Sep 04 13:33:12 +0000 2019",
102 |             "id": 1169242008432644000,
103 |             "id_str": "1169242008432644097",
104 |             "full_text": "My new post: an explainer on “carbon capture &amp; utilization” (CCU). CO2 captured from waste gases or the ambient air can be used to make valuable products. Could CCU help the carbon capture industry scale up? https://t.co/cVDz7Xxi4E",
105 |             "truncated": false,
106 |             "display_text_range": [
107 |                 0,
108 |                 235
109 |             ],
110 |             "entities": {
111 |                 "hashtags": [],
112 |                 "symbols": [],
113 |                 "user_mentions": [],
114 |                 "urls": [
115 |                     {
116 |                         "url": "https://t.co/cVDz7Xxi4E",
117 |                         "expanded_url": "https://www.vox.com/energy-and-environment/2019/9/4/20829431/climate-change-carbon-capture-utilization-sequestration-ccu-ccs?utm_campaign=drvox&utm_content=chorus&utm_medium=social&utm_source=twitter",
118 |                         "display_url": "vox.com/energy-and-env…",
119 |                         "indices": [
120 |                             212,
121 |                             235
122 |                         ]
123 |                     }
124 |                 ]
125 |             },
126 |             "source": "<a href=\"http://www.voxmedia.com\" rel=\"nofollow\">Vox Media</a>",
127 |             "in_reply_to_status_id": null,
128 |             "in_reply_to_status_id_str": null,
129 |             "in_reply_to_user_id": null,
130 |             "in_reply_to_user_id_str": null,
131 |             "in_reply_to_screen_name": null,
132 |             "user": {
133 |                 "id": 22737278,
134 |                 "id_str": "22737278",
135 |                 "name": "David Roberts",
136 |                 "screen_name": "drvox",
137 |                 "location": "Seattle, WA",
138 |                 "description": "Seattleite transplanted from Tennessee; now blogging for https://t.co/5gESirnht7 about energy politics. Climate hawk, deficit dove. Not a doctor.",
139 |                 "url": "http://t.co/AMWwRyre24",
140 |                 "entities": {
141 |                     "url": {
142 |                         "urls": [
143 |                             {
144 |                                 "url": "http://t.co/AMWwRyre24",
145 |                                 "expanded_url": "http://www.vox.com/authors/david-roberts",
146 |                                 "display_url": "vox.com/authors/david-…",
147 |                                 "indices": [
148 |                                     0,
149 |                                     22
150 |                                 ]
151 |                             }
152 |                         ]
153 |                     },
154 |                     "description": {
155 |                         "urls": [
156 |                             {
157 |                                 "url": "https://t.co/5gESirnht7",
158 |                                 "expanded_url": "http://Vox.com",
159 |                                 "display_url": "Vox.com",
160 |                                 "indices": [
161 |                                     57,
162 |                                     80
163 |                                 ]
164 |                             }
165 |                         ]
166 |                     }
167 |                 },
168 |                 "protected": false,
169 |                 "followers_count": 132789,
170 |                 "friends_count": 2723,
171 |                 "listed_count": 4644,
172 |                 "created_at": "Wed Mar 04 05:14:12 +0000 2009",
173 |                 "favourites_count": 26,
174 |                 "utc_offset": null,
175 |                 "time_zone": null,
176 |                 "geo_enabled": false,
177 |                 "verified": true,
178 |                 "statuses_count": 13887,
179 |                 "lang": null,
180 |                 "contributors_enabled": false,
181 |                 "is_translator": false,
182 |                 "is_translation_enabled": false,
183 |                 "profile_background_color": "022330",
184 |                 "profile_background_image_url": "http://abs.twimg.com/images/themes/theme15/bg.png",
185 |                 "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme15/bg.png",
186 |                 "profile_background_tile": false,
187 |                 "profile_image_url": "http://pbs.twimg.com/profile_images/551076081051004929/2i4QEfn-_normal.jpeg",
188 |                 "profile_image_url_https": "https://pbs.twimg.com/profile_images/551076081051004929/2i4QEfn-_normal.jpeg",
189 |                 "profile_banner_url": "https://pbs.twimg.com/profile_banners/22737278/1433745271",
190 |                 "profile_link_color": "0084B4",
191 |                 "profile_sidebar_border_color": "A8C7F7",
192 |                 "profile_sidebar_fill_color": "C0DFEC",
193 |                 "profile_text_color": "333333",
194 |                 "profile_use_background_image": true,
195 |                 "has_extended_profile": false,
196 |                 "default_profile": false,
197 |                 "default_profile_image": false,
198 |                 "following": true,
199 |                 "follow_request_sent": false,
200 |                 "notifications": false,
201 |                 "translator_type": "none"
202 |             },
203 |             "geo": null,
204 |             "coordinates": null,
205 |             "place": null,
206 |             "contributors": null,
207 |             "is_quote_status": false,
208 |             "retweet_count": 42,
209 |             "favorite_count": 86,
210 |             "favorited": true,
211 |             "retweeted": true,
212 |             "possibly_sensitive": false,
213 |             "lang": "en"
214 |         },
215 |         "is_quote_status": false,
216 |         "retweet_count": 42,
217 |         "favorite_count": 0,
218 |         "favorited": true,
219 |         "retweeted": true,
220 |         "lang": "en"
221 |     },
222 |     {
223 |         "created_at": "Wed Sep 04 10:32:10 +0000 2019",
224 |         "id": 1169196446043664400,
225 |         "id_str": "1169196446043664385",
226 |         "full_text": "@scientiffic @Wikipedia @unsplash @cagarrity The @inaturalist API is amazingly powerful and fun with no auth and no rate limit. We used it to build https://t.co/q17EOpkGc3 - see also @Natbat's great tutorial on using it with @observablehq https://t.co/WbYktnYxBc",
227 |         "truncated": false,
228 |         "display_text_range": [
229 |             45,
230 |             262
231 |         ],
232 |         "extended_entities": {
233 |             "media": [
234 |                 {
235 |                     "id": 504727051174031360,
236 |                     "id_str": "504727051174031361",
237 |                     "indices": [
238 |                         116,
239 |                         138
240 |                     ],
241 |                     "media_url": "http://pbs.twimg.com/media/BwEmlDHCMAEGwu_.jpg",
242 |                     "media_url_https": "https://pbs.twimg.com/media/BwEmlDHCMAEGwu_.jpg",
243 |                     "url": "http://t.co/uavPaDsvNe",
244 |                     "display_url": "pic.twitter.com/uavPaDsvNe",
245 |                     "expanded_url": "https://twitter.com/UpturnedBathtub/status/504727120812453889/photo/1",
246 |                     "type": "photo",
247 |                     "sizes": {
248 |                         "thumb": {
249 |                             "w": 150,
250 |                             "h": 150,
251 |                             "resize": "crop"
252 |                         },
253 |                         "large": {
254 |                             "w": 1024,
255 |                             "h": 768,
256 |                             "resize": "fit"
257 |                         },
258 |                         "medium": {
259 |                             "w": 1024,
260 |                             "h": 768,
261 |                             "resize": "fit"
262 |                         },
263 |                         "small": {
264 |                             "w": 680,
265 |                             "h": 510,
266 |                             "resize": "fit"
267 |                         }
268 |                     }
269 |                 }
270 |             ]
271 |         },
272 |         "entities": {
273 |             "hashtags": [],
274 |             "symbols": [],
275 |             "user_mentions": [
276 |                 {
277 |                     "screen_name": "scientiffic",
278 |                     "name": "Tiffany Tseng 🍡",
279 |                     "id": 82016165,
280 |                     "id_str": "82016165",
281 |                     "indices": [
282 |                         0,
283 |                         12
284 |                     ]
285 |                 },
286 |                 {
287 |                     "screen_name": "Wikipedia",
288 |                     "name": "Wikipedia",
289 |                     "id": 86390214,
290 |                     "id_str": "86390214",
291 |                     "indices": [
292 |                         13,
293 |                         23
294 |                     ]
295 |                 },
296 |                 {
297 |                     "screen_name": "unsplash",
298 |                     "name": "Unsplash",
299 |                     "id": 1520228526,
300 |                     "id_str": "1520228526",
301 |                     "indices": [
302 |                         24,
303 |                         33
304 |                     ]
305 |                 },
306 |                 {
307 |                     "screen_name": "cagarrity",
308 |                     "name": "Chris Garrity",
309 |                     "id": 92321453,
310 |                     "id_str": "92321453",
311 |                     "indices": [
312 |                         34,
313 |                         44
314 |                     ]
315 |                 },
316 |                 {
317 |                     "screen_name": "inaturalist",
318 |                     "name": "iNaturalist",
319 |                     "id": 14239043,
320 |                     "id_str": "14239043",
321 |                     "indices": [
322 |                         49,
323 |                         61
324 |                     ]
325 |                 },
326 |                 {
327 |                     "screen_name": "Natbat",
328 |                     "name": "Natbat",
329 |                     "id": 12161,
330 |                     "id_str": "12161",
331 |                     "indices": [
332 |                         183,
333 |                         190
334 |                     ]
335 |                 },
336 |                 {
337 |                     "screen_name": "observablehq",
338 |                     "name": "Observable",
339 |                     "id": 905255756789825500,
340 |                     "id_str": "905255756789825536",
341 |                     "indices": [
342 |                         225,
343 |                         238
344 |                     ]
345 |                 }
346 |             ],
347 |             "urls": [
348 |                 {
349 |                     "url": "https://t.co/q17EOpkGc3",
350 |                     "expanded_url": "http://www.owlsnearme.com",
351 |                     "display_url": "owlsnearme.com",
352 |                     "indices": [
353 |                         148,
354 |                         171
355 |                     ]
356 |                 },
357 |                 {
358 |                     "url": "https://t.co/WbYktnYxBc",
359 |                     "expanded_url": "https://24ways.org/2018/observable-notebooks-and-inaturalist/",
360 |                     "display_url": "24ways.org/2018/observabl…",
361 |                     "indices": [
362 |                         239,
363 |                         262
364 |                     ]
365 |                 }
366 |             ]
367 |         },
368 |         "source": "",
369 |         "in_reply_to_status_id": 1169079390577320000,
370 |         "in_reply_to_status_id_str": "1169079390577319937",
371 |         "in_reply_to_user_id": 82016165,
372 |         "in_reply_to_user_id_str": "82016165",
373 |         "in_reply_to_screen_name": "scientiffic",
374 |         "user": {
375 |             "id": 12497,
376 |             "id_str": "12497",
377 |             "name": "Simon Willison",
378 |             "screen_name": "simonw",
379 |             "location": "San Francisco, CA",
380 |             "description": "Creator of Datasette, co-creator Django. Fellow at @JSKstanford. Usually hanging out with @natbat and @cleopaws. He/Him",
381 |             "url": "https://t.co/wyNggeHZ8W",
382 |             "entities": {
383 |                 "url": {
384 |                     "urls": [
385 |                         {
386 |                             "url": "https://t.co/wyNggeHZ8W",
387 |                             "expanded_url": "https://simonwillison.net/",
388 |                             "display_url": "simonwillison.net",
389 |                             "indices": [
390 |                                 0,
391 |                                 23
392 |                             ]
393 |                         }
394 |                     ]
395 |                 },
396 |                 "description": {
397 |                     "urls": []
398 |                 }
399 |             },
400 |             "protected": false,
401 |             "followers_count": 17754,
402 |             "friends_count": 3460,
403 |             "listed_count": 1230,
404 |             "created_at": "Wed Nov 15 13:18:50 +0000 2006",
405 |             "favourites_count": 21506,
406 |             "utc_offset": null,
407 |             "time_zone": null,
408 |             "geo_enabled": true,
409 |             "verified": true,
410 |             "statuses_count": 17780,
411 |             "lang": null,
412 |             "contributors_enabled": false,
413 |             "is_translator": false,
414 |             "is_translation_enabled": false,
415 |             "profile_background_color": "000000",
416 |             "profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png",
417 |             "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png",
418 |             "profile_background_tile": false,
419 |             "profile_image_url": "http://pbs.twimg.com/profile_images/378800000261649705/be9cc55e64014e6d7663c50d7cb9fc75_normal.jpeg",
420 |             "profile_image_url_https": "https://pbs.twimg.com/profile_images/378800000261649705/be9cc55e64014e6d7663c50d7cb9fc75_normal.jpeg",
421 |             "profile_banner_url": "https://pbs.twimg.com/profile_banners/12497/1347977147",
422 |             "profile_link_color": "0000FF",
423 |             "profile_sidebar_border_color": "FFFFFF",
424 |             "profile_sidebar_fill_color": "FFFFFF",
425 |             "profile_text_color": "000000",
426 |             "profile_use_background_image": true,
427 |             "has_extended_profile": true,
428 |             "default_profile": false,
429 |             "default_profile_image": false,
430 |             "following": false,
431 |             "follow_request_sent": false,
432 |             "notifications": false,
433 |             "translator_type": "regular"
434 |         },
435 |         "geo": null,
436 |         "coordinates": null,
437 |         "place": {
438 |             "id": "01a9a39529b27f36",
439 |             "url": "https://api.twitter.com/1.1/geo/id/01a9a39529b27f36.json",
440 |             "place_type": "city",
441 |             "name": "Manhattan",
442 |             "full_name": "Manhattan, NY",
443 |             "country_code": "US",
444 |             "country": "United States",
445 |             "contained_within": [],
446 |             "bounding_box": {
447 |                 "type": "Polygon",
448 |                 "coordinates": [
449 |                     [
450 |                         [
451 |                             -74.026675,
452 |                             40.683935
453 |                         ],
454 |                         [
455 |                             -73.910408,
456 |                             40.683935
457 |                         ],
458 |                         [
459 |                             -73.910408,
460 |                             40.877483
461 |                         ],
462 |                         [
463 |                             -74.026675,
464 |                             40.877483
465 |                         ]
466 |                     ]
467 |                 ]
468 |             },
469 |             "attributes": {}
470 |         },
471 |         "contributors": null,
472 |         "is_quote_status": false,
473 |         "retweet_count": 0,
474 |         "favorite_count": 2,
475 |         "favorited": false,
476 |         "retweeted": false,
477 |         "possibly_sensitive": false,
478 |         "lang": "en"
479 |     },
480 |     {
481 |         "created_at": "Mon Sep 02 14:19:58 +0000 2019",
482 |         "id": 1168529001599533000,
483 |         "id_str": "1168529001599533057",
484 |         "full_text": "Finally got around to running this script. It is BRILLIANT - it produces a concatenated .wav file of the audio from every live photo you've ever taken.\n\nNeeds quite a lot of disk space to run - the /tmp/picblast folder can take multiple GB https://t.co/AJNTJbhF0g",
485 |         "truncated": false,
486 |         "display_text_range": [
487 |             0,
488 |             239
489 |         ],
490 |         "entities": {
491 |             "hashtags": [],
492 |             "symbols": [],
493 |             "user_mentions": [],
494 |             "urls": [
495 |                 {
496 |                     "url": "https://t.co/AJNTJbhF0g",
497 |                     "expanded_url": "https://twitter.com/bwhitman/status/861696799362478085",
498 |                     "display_url": "twitter.com/bwhitman/statu…",
499 |                     "indices": [
500 |                         240,
501 |                         263
502 |                     ]
503 |                 }
504 |             ]
505 |         },
506 |         "source": "<a href=\"https://mobile.twitter.com\" rel=\"nofollow\">Twitter Web App</a>",
507 |         "in_reply_to_status_id": null,
508 |         "in_reply_to_status_id_str": null,
509 |         "in_reply_to_user_id": null,
510 |         "in_reply_to_user_id_str": null,
511 |         "in_reply_to_screen_name": null,
512 |         "user": {
513 |             "id": 12497,
514 |             "id_str": "12497",
515 |             "name": "Simon Willison",
516 |             "screen_name": "simonw",
517 |             "location": "San Francisco, CA",
518 |             "description": "Creator of Datasette, co-creator Django. Fellow at @JSKstanford. Usually hanging out with @natbat and @cleopaws. He/Him",
519 |             "url": "https://t.co/wyNggeHZ8W",
520 |             "entities": {
521 |                 "url": {
522 |                     "urls": [
523 |                         {
524 |                             "url": "https://t.co/wyNggeHZ8W",
525 |                             "expanded_url": "https://simonwillison.net/",
526 |                             "display_url": "simonwillison.net",
527 |                             "indices": [
528 |                                 0,
529 |                                 23
530 |                             ]
531 |                         }
532 |                     ]
533 |                 },
534 |                 "description": {
535 |                     "urls": []
536 |                 }
537 |             },
538 |             "protected": false,
539 |             "followers_count": 17754,
540 |             "friends_count": 3460,
541 |             "listed_count": 1230,
542 |             "created_at": "Wed Nov 15 13:18:50 +0000 2006",
543 |             "favourites_count": 21506,
544 |             "utc_offset": null,
545 |             "time_zone": null,
546 |             "geo_enabled": true,
547 |             "verified": true,
548 |             "statuses_count": 17780,
549 |             "lang": null,
550 |             "contributors_enabled": false,
551 |             "is_translator": false,
552 |             "is_translation_enabled": false,
553 |             "profile_background_color": "000000",
554 |             "profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png",
555 |             "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png",
556 |             "profile_background_tile": false,
557 |             "profile_image_url": "http://pbs.twimg.com/profile_images/378800000261649705/be9cc55e64014e6d7663c50d7cb9fc75_normal.jpeg",
558 |             "profile_image_url_https": "https://pbs.twimg.com/profile_images/378800000261649705/be9cc55e64014e6d7663c50d7cb9fc75_normal.jpeg",
559 |             "profile_banner_url": "https://pbs.twimg.com/profile_banners/12497/1347977147",
560 |             "profile_link_color": "0000FF",
561 |             "profile_sidebar_border_color": "FFFFFF",
562 |             "profile_sidebar_fill_color": "FFFFFF",
563 |             "profile_text_color": "000000",
564 |             "profile_use_background_image": true,
565 |             "has_extended_profile": true,
566 |             "default_profile": false,
567 |             "default_profile_image": false,
568 |             "following": false,
569 |             "follow_request_sent": false,
570 |             "notifications": false,
571 |             "translator_type": "regular"
572 |         },
573 |         "geo": null,
574 |         "coordinates": null,
575 |         "place": null,
576 |         "contributors": null,
577 |         "is_quote_status": true,
578 |         "quoted_status_id": 861696799362478100,
579 |         "quoted_status_id_str": "861696799362478085",
580 |         "quoted_status_permalink": {
581 |             "url": "https://t.co/AJNTJbhF0g",
582 |             "expanded": "https://twitter.com/bwhitman/status/861696799362478085",
583 |             "display": "twitter.com/bwhitman/statu…"
584 |         },
585 |         "quoted_status": {
586 |             "created_at": "Mon May 08 21:38:21 +0000 2017",
587 |             "id": 861696799362478100,
588 |             "id_str": "861696799362478085",
589 |             "full_text": "If you use Photos (mac) &amp; Live Photos, run this command to generate a lovely sound collage of where you’ve been https://t.co/cEbhE4P3ZM",
590 |             "truncated": false,
591 |             "display_text_range": [
592 |                 0,
593 |                 139
594 |             ],
595 |             "entities": {
596 |                 "hashtags": [],
597 |                 "symbols": [],
598 |                 "user_mentions": [],
599 |                 "urls": [
600 |                     {
601 |                         "url": "https://t.co/cEbhE4P3ZM",
602 |                         "expanded_url": "https://gist.github.com/bwhitman/5be2f905556a25145dbac74fe4080739",
603 |                         "display_url": "gist.github.com/bwhitman/5be2f…",
604 |                         "indices": [
605 |                             116,
606 |                             139
607 |                         ]
608 |                     }
609 |                 ]
610 |             },
611 |             "source": "<a href=\"http://itunes.apple.com/us/app/twitter/id409789998?mt=12\" rel=\"nofollow\">Twitter for Mac</a>",
612 |             "in_reply_to_status_id": null,
613 |             "in_reply_to_status_id_str": null,
614 |             "in_reply_to_user_id": null,
615 |             "in_reply_to_user_id_str": null,
616 |             "in_reply_to_screen_name": null,
617 |             "user": {
618 |                 "id": 14148390,
619 |                 "id_str": "14148390",
620 |                 "name": "Brian Whitman",
621 |                 "screen_name": "bwhitman",
622 |                 "location": "Fort Greene NYC",
623 |                 "description": "finding the good @ourcanopy with the best people. was CTO/cofounder of Echo Nest, then research @ Spotify. always music",
624 |                 "url": "https://t.co/S9eq2BkZsn",
625 |                 "entities": {
626 |                     "url": {
627 |                         "urls": [
628 |                             {
629 |                                 "url": "https://t.co/S9eq2BkZsn",
630 |                                 "expanded_url": "https://notes.variogr.am/about/",
631 |                                 "display_url": "notes.variogr.am/about/",
632 |                                 "indices": [
633 |                                     0,
634 |                                     23
635 |                                 ]
636 |                             }
637 |                         ]
638 |                     },
639 |                     "description": {
640 |                         "urls": []
641 |                     }
642 |                 },
643 |                 "protected": false,
644 |                 "followers_count": 4300,
645 |                 "friends_count": 639,
646 |                 "listed_count": 235,
647 |                 "created_at": "Fri Mar 14 18:19:20 +0000 2008",
648 |                 "favourites_count": 8966,
649 |                 "utc_offset": null,
650 |                 "time_zone": null,
651 |                 "geo_enabled": true,
652 |                 "verified": false,
653 |                 "statuses_count": 2192,
654 |                 "lang": null,
655 |                 "contributors_enabled": false,
656 |                 "is_translator": false,
657 |                 "is_translation_enabled": false,
658 |                 "profile_background_color": "FFFFFF",
659 |                 "profile_background_image_url": "http://abs.twimg.com/images/themes/theme13/bg.gif",
660 |                 "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme13/bg.gif",
661 |                 "profile_background_tile": false,
662 |                 "profile_image_url": "http://pbs.twimg.com/profile_images/742302060/avatars-000000620200-z21ozh-crop_normal.jpeg",
663 |                 "profile_image_url_https": "https://pbs.twimg.com/profile_images/742302060/avatars-000000620200-z21ozh-crop_normal.jpeg",
664 |                 "profile_banner_url": "https://pbs.twimg.com/profile_banners/14148390/1398269147",
665 |                 "profile_link_color": "911A1A",
666 |                 "profile_sidebar_border_color": "EEEEEE",
667 |                 "profile_sidebar_fill_color": "FFFFFF",
668 |                 "profile_text_color": "333333",
669 |                 "profile_use_background_image": false,
670 |                 "has_extended_profile": true,
671 |                 "default_profile": false,
672 |                 "default_profile_image": false,
673 |                 "following": false,
674 |                 "follow_request_sent": false,
675 |                 "notifications": false,
676 |                 "translator_type": "none"
677 |             },
678 |             "geo": null,
679 |             "coordinates": null,
680 |             "place": null,
681 |             "contributors": null,
682 |             "is_quote_status": false,
683 |             "retweet_count": 14,
684 |             "favorite_count": 57,
685 |             "favorited": false,
686 |             "retweeted": false,
687 |             "possibly_sensitive": false,
688 |             "lang": "en"
689 |         },
690 |         "retweet_count": 4,
691 |         "favorite_count": 31,
692 |         "favorited": false,
693 |         "retweeted": false,
694 |         "possibly_sensitive": false,
695 |         "lang": "en"
696 |     }
697 | ]


--------------------------------------------------------------------------------
/tests/utils.py:
--------------------------------------------------------------------------------
 1 | import io
 2 | import zipfile
 3 | 
 4 | 
 5 | def create_zip(path, buf=None):
 6 |     if buf is None:
 7 |         buf = io.BytesIO()
 8 |     zf = zipfile.ZipFile(buf, "w")
 9 |     for filepath in path.glob("**/*"):
10 |         if filepath.is_file():
11 |             zf.write(filepath, str(filepath.relative_to(path)))
12 |     return zf
13 | 


--------------------------------------------------------------------------------
/tests/zip_contents/account-suspension.js:
--------------------------------------------------------------------------------
1 | window.YTD.account_suspension.part0 = []
2 | 


--------------------------------------------------------------------------------
/tests/zip_contents/account.js:
--------------------------------------------------------------------------------
 1 | window.YTD.account.part0 = [ {
 2 |   "account" : {
 3 |     "phoneNumber" : "+15555555555",
 4 |     "email" : "swillison@example.com",
 5 |     "createdVia" : "web",
 6 |     "username" : "simonw",
 7 |     "accountId" : "12497",
 8 |     "createdAt" : "2006-11-15T13:18:50.000Z",
 9 |     "accountDisplayName" : "Simon Willison"
10 |   }
11 | } ]


--------------------------------------------------------------------------------
/tests/zip_contents/app.js:
--------------------------------------------------------------------------------
 1 | window.YTD.app.part0 = [
 2 |   {
 3 |     "app" : {
 4 |       "appId" : "1380676511",
 5 |       "appNames" : [
 6 |         "BBC Sounds"
 7 |       ]
 8 |     }
 9 |   }
10 | ]


--------------------------------------------------------------------------------
/tests/zip_contents/follower.js:
--------------------------------------------------------------------------------
1 | window.YTD.follower.part0 = [ {
2 |   "follower" : {
3 |     "accountId" : "73747798"
4 |   }
5 | }, {
6 |   "follower" : {
7 |     "accountId" : "386025404"
8 |   }
9 | } ]


--------------------------------------------------------------------------------
/tests/zip_contents/following.js:
--------------------------------------------------------------------------------
1 | window.YTD.following.part0 = [ {
2 |   "following" : {
3 |     "accountId" : "547842573"
4 |   }
5 | }, {
6 |   "following" : {
7 |     "accountId" : "12158"
8 |   }
9 | } ]


--------------------------------------------------------------------------------
/tests/zip_contents/saved-search.js:
--------------------------------------------------------------------------------
 1 | window.YTD.saved_search.part0 = [ {
 2 |   "savedSearch" : {
 3 |     "savedSearchId" : "42214",
 4 |     "query" : "simonw"
 5 |   }
 6 | }, {
 7 |   "savedSearch" : {
 8 |     "savedSearchId" : "55814",
 9 |     "query" : "django"
10 |   }
11 | } ]


--------------------------------------------------------------------------------
/twitter_to_sqlite/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dogsheep/twitter-to-sqlite/f09d611782a8372cfb002792dfa727325afb4db6/twitter_to_sqlite/__init__.py


--------------------------------------------------------------------------------
/twitter_to_sqlite/archive.py:
--------------------------------------------------------------------------------
  1 | # Utilities for dealing with Twitter archives
  2 | import json
  3 | 
# Maps an archive filename (without the ".js" extension) to a
# (callable, pk) pair. The callable takes the JSON parsed from that
# file and returns a dictionary of tables that should be created:
# {"table": [rows-to-upsert]}. pk is the primary key column name, or
# None, in which case import_from_file falls back to a hash-based id.
transformers = {}

# These files are deliberately ignored
IGNORE = {"manifest"}
 12 | 
 13 | 
 14 | def register(filename, each, pk=None):
 15 |     def callback(data):
 16 |         return {filename: [item.get(each) for item in data]}
 17 | 
 18 |     transformers[filename] = (callback, pk)
 19 | 
 20 | 
 21 | def register_each(filename, pk=None):
 22 |     def inner(fn):
 23 |         def callback(data):
 24 |             return {filename: [fn(item) for item in data]}
 25 | 
 26 |         transformers[filename] = (callback, pk)
 27 | 
 28 |     return inner
 29 | 
 30 | 
 31 | def register_multi(filename):
 32 |     def inner(fn):
 33 |         transformers[filename] = (fn, None)
 34 | 
 35 |     return inner
 36 | 
 37 | 
 38 | def register_all(filename):
 39 |     def inner(fn):
 40 |         transformers[filename] = (fn, None)
 41 | 
 42 |     return inner
 43 | 
 44 | 
 45 | def extract_json(contents):
 46 |     # window.YTD.account_creation_ip.part0 = [ ... data ...]
 47 |     contents = contents.strip()
 48 |     if contents.startswith(b"window."):
 49 |         contents = contents.split(b" = ", 1)[1]
 50 |     return json.loads(contents)
 51 | 
 52 | 
# Account-level files: each item wraps its record under the key given as
# `each`. No pk is passed, so import_from_file inserts with hash_id="pk".
register("account-creation-ip", each="accountCreationIp")
register("account-suspension", each="accountSuspension")
register("account-timezone", each="accountTimezone")
register("account", each="account")
 57 | 
 58 | 
 59 | @register_each("ad-engagements")
 60 | def ad_engagements(item):
 61 |     return item["ad"]["adsUserData"]["adEngagements"]
 62 | 
 63 | 
 64 | @register_each("ad-impressions")
 65 | def ad_impressions(item):
 66 |     return item["ad"]["adsUserData"]["adImpressions"]
 67 | 
 68 | 
 69 | @register_each("ad-mobile-conversions-attributed")
 70 | def ad_mobile_conversions_attributed(item):
 71 |     return item["ad"]["adsUserData"]["attributedMobileAppConversions"]
 72 | 
 73 | 
 74 | @register_each("ad-mobile-conversions-unattributed")
 75 | def ad_mobile_conversions_unattributed(item):
 76 |     return item["ad"]["adsUserData"]["unattributedMobileAppConversions"]
 77 | 
 78 | 
 79 | @register_each("ad-online-conversions-attributed")
 80 | def ad_online_conversions_attributed(item):
 81 |     return item["ad"]["adsUserData"]["attributedOnlineConversions"]
 82 | 
 83 | 
 84 | @register_each("ad-online-conversions-unattributed")
 85 | def ad_online_conversions_unattributed(item):
 86 |     return item["ad"]["adsUserData"]["unattributedOnlineConversions"]
 87 | 
 88 | 
 89 | @register_each("ageinfo")
 90 | def ageinfo(item):
 91 |     return item["ageMeta"]["ageInfo"]
 92 | 
 93 | 
# Files whose items wrap the record under a single key (`each`). Where a
# pk is given, import_from_file uses that column as the primary key;
# otherwise it inserts with hash_id="pk".
register("app", each="app", pk="appId")

register("block", each="blocking", pk="accountId")
register("connected-application", each="connectedApplication", pk="id")
# register("contact", ...)
register("direct-message-group-headers", each="dmConversation", pk="conversationId")
register("direct-messages-group", each="dmConversation", pk="conversationId")
register("direct-message-headers", each="dmConversation", pk="conversationId")
# pk for this one is NOT set, because there are dupes:
# TODO: These actually do warrant separate tables:
register("direct-messages", each="dmConversation")

register("email-address-change", each="emailAddressChange")
register("follower", each="follower", pk="accountId")
register("following", each="following", pk="accountId")
register("ip-audit", each="ipAudit")
register("like", each="like", pk="tweetId")
111 | 
112 | 
@register_all("lists-created")
def lists_created(data):
    """Build the ``lists-created`` table rows via ``_list_from_common``."""
    rows = _list_from_common(data)
    return {"lists-created": rows}
116 | 
117 | 
@register_all("lists-member")
def lists_member(data):
    """Build the ``lists-member`` table rows via ``_list_from_common``."""
    rows = _list_from_common(data)
    return {"lists-member": rows}
121 | 
122 | 
@register_all("lists-subscribed")
def lists_subscribed(data):
    """Build the ``lists-subscribed`` table rows via ``_list_from_common``."""
    rows = _list_from_common(data)
    return {"lists-subscribed": rows}
126 | 
127 | 
# Each item wraps its record under "moment"; momentId is the primary key.
register("moment", each="moment", pk="momentId")
# register("mute", ...)
130 | 
131 | 
@register_all("ni-devices")
def ni_devices(data):
    """Flatten the ``niDeviceResponse`` blocks into a list of device rows.

    Each block's ``niDeviceResponse`` mapping is read for its first
    key/value pair: the key becomes the row's ``category`` and the value
    (a mapping of device details) supplies the remaining columns. Rows are
    built as copies, so the parsed input is not mutated.

    Raises ``KeyError`` when a block lacks ``niDeviceResponse`` and
    ``IndexError`` when that mapping is empty.
    """
    devices = []
    for block in data:
        response = block["niDeviceResponse"]
        category, details = list(response.items())[0]
        # Copy rather than writing "category" into the caller's dict.
        devices.append(dict(details, category=category))
    # NOTE(review): the table key is spelled "ne-devices" although the file
    # is "ni-devices". Kept as-is because it determines the resulting table
    # name (archive_ne_devices); confirm whether this is intentional.
    return {"ne-devices": devices}
142 | 
143 | 
144 | # Skipped all the periscope- stuff for the moment
145 | 
146 | 
@register_multi("personalization")
def personalization(data):
    """Split the personalization file into several tables.

    Only the first element of ``data`` is read. Returns a dict mapping
    table names to the list of rows to insert; single values are wrapped
    in a one-element list and plain names become ``{"name": ...}`` rows.
    """
    p13n = data[0]["p13nData"]
    tables = {}

    demographics = p13n["demographics"]
    tables["personalization-demographics-languages"] = demographics["languages"]
    tables["personalization-demographics-genderInfo"] = [demographics["genderInfo"]]

    interests = p13n["interests"]
    tables["personalization-interests"] = interests["interests"]
    tables["personalization-partnerInterests"] = interests["partnerInterests"]

    audience = interests["audienceAndAdvertisers"]
    tables["personalization-advertisers"] = [
        {"name": advertiser} for advertiser in audience["advertisers"]
    ]
    tables["personalization-num-audiences"] = [
        {"numAudiences": audience["numAudiences"]}
    ]

    tables["personalization-shows"] = [
        {"name": show} for show in interests["shows"]
    ]
    tables["personalization-locationHistory"] = [
        {"name": location} for location in p13n["locationHistory"]
    ]
    tables["personalization-inferredAgeInfo"] = [p13n["inferredAgeInfo"]]
    return tables
181 | 
182 | 
# phone-number items wrap their record under "device"; profile items
# under "profile". Neither has a pk.
register("phone-number", each="device")
register("profile", each="profile")
# protected-history.js

register("saved-search", each="savedSearch", pk="savedSearchId")
# screen-name-change.js
189 | 
190 | 
@register_each("tweet", pk="id")
def tweet(item):
    """Normalize one tweet record from the archive.

    Unwraps the ``tweet`` sub-key when present, converts ``id`` and every
    ``*_id`` value to ``int``, and makes sure a few optional columns exist
    (as ``None``) so every row has the same columns. The tweet dict is
    modified in place and returned.
    """
    # Older versions of the archive have the tweet data at the top level of the
    # item; newer versions have it all in a 'tweet' sub-key.
    if "tweet" in item:
        item = item["tweet"]

    # Only values of existing keys are reassigned, so the dict keeps its
    # size and iterating it while updating is safe.
    for key, value in item.items():
        # A null ID (e.g. a reply field with no value) stays None rather
        # than crashing int() with a TypeError.
        if (key == "id" or key.endswith("_id")) and value is not None:
            item[key] = int(value)

    # Handle some columns that are sometimes missing
    optional_columns = ["possibly_sensitive", "coordinates", "geo", "extended_entities"]
    for col in optional_columns:
        item.setdefault(col, None)

    return item
208 | 
209 | 
210 | register("verified", each="verified")
211 | 
212 | 
213 | def _list_from_common(data):
214 |     lists = []
215 |     for block in data:
216 |         info = block["userListInfo"]
217 |         if "urls" in info:
218 |             urls = info["urls"]
219 |         elif "url" in info:
220 |             urls = [info["url"]]
221 |         else:
222 |             urls = []
223 |         for url in urls:
224 |             bits = url.split("/")
225 |             lists.append({"screen_name": bits[-3], "list_slug": bits[-1]})
226 |     return lists
227 | 
228 | 
def import_from_file(db, filename, content):
    """Import one archive ``.js`` file into ``db``.

    ``filename`` must end in ``.js``; the stem selects the registered
    transformer. Files with no transformer are skipped, with a notice
    printed unless the stem is listed in ``IGNORE``. Each table the
    transformer returns is stored as ``archive_<table>`` (dashes become
    underscores); a table that existed before the import is dropped first.
    """
    assert filename.endswith(".js"), "{} does not end with .js".format(filename)
    existing_tables = set(db.table_names())
    stem = filename[:-3]
    registered = transformers.get(stem)
    if registered is None:
        if stem not in IGNORE:
            print("{}: not yet implemented".format(stem))
        return
    transformer, pk = registered
    tables = transformer(extract_json(content))
    for table, rows in tables.items():
        table_name = "archive_" + table.replace("-", "_")
        # Drop and re-create if it already exists
        if table_name in existing_tables:
            db[table_name].drop()
        # Without a declared primary key, fall back to a hash-based "pk" column
        key_options = {"hash_id": "pk"} if pk is None else {"pk": pk}
        db[table_name].insert_all(rows, replace=True, **key_options)
249 | 


--------------------------------------------------------------------------------
/twitter_to_sqlite/cli.py:
--------------------------------------------------------------------------------
  1 | import datetime
  2 | import hashlib
  3 | import json
  4 | import os
  5 | import pathlib
  6 | import time
  7 | 
  8 | import click
  9 | 
 10 | from twitter_to_sqlite import archive
 11 | from twitter_to_sqlite import utils
 12 | 
 13 | 
 14 | def add_identifier_options(subcommand):
 15 |     for decorator in reversed(
 16 |         (
 17 |             click.argument("identifiers", type=str, nargs=-1),
 18 |             click.option(
 19 |                 "--attach",
 20 |                 type=click.Path(
 21 |                     file_okay=True, dir_okay=False, allow_dash=False, exists=True
 22 |                 ),
 23 |                 multiple=True,
 24 |                 help="Additional database file to attach",
 25 |             ),
 26 |             click.option("--sql", help="SQL query to fetch identifiers to use"),
 27 |         )
 28 |     ):
 29 |         subcommand = decorator(subcommand)
 30 |     return subcommand
 31 | 
 32 | 
# Root command group; the subcommands in this module attach themselves to it
# with @cli.command(). The docstring is left terse because click uses it as
# the command's help text.
@click.group()
@click.version_option()
def cli():
    "Save data from Twitter to a SQLite database"
 37 | 
 38 | 
 39 | @cli.command()
 40 | @click.argument("url")
 41 | @click.option(
 42 |     "-a",
 43 |     "--auth",
 44 |     type=click.Path(file_okay=True, dir_okay=False, allow_dash=True, exists=True),
 45 |     default="auth.json",
 46 |     help="Path to auth.json token file",
 47 | )
 48 | def fetch(url, auth):
 49 |     "Make an authenticated request to the Twitter API"
 50 |     auth = json.load(open(auth))
 51 |     session = utils.session_for_auth(auth)
 52 |     click.echo(json.dumps(session.get(url).json(), indent=4))
 53 | 
 54 | 
 55 | @cli.command()
 56 | @click.option(
 57 |     "-a",
 58 |     "--auth",
 59 |     type=click.Path(file_okay=True, dir_okay=False, allow_dash=False),
 60 |     default="auth.json",
 61 |     help="Path to save tokens to, defaults to auth.json",
 62 | )
 63 | def auth(auth):
 64 |     "Save authentication credentials to a JSON file"
 65 |     click.echo("Create an app here: https://developer.twitter.com/en/apps")
 66 |     click.echo("Then navigate to 'Keys and tokens' and paste in the following:")
 67 |     click.echo()
 68 |     api_key = click.prompt("API key")
 69 |     api_secret_key = click.prompt("API secret key")
 70 |     access_token = click.prompt("Access token")
 71 |     access_token_secret = click.prompt("Access token secret")
 72 |     open(auth, "w").write(
 73 |         json.dumps(
 74 |             {
 75 |                 "api_key": api_key,
 76 |                 "api_secret_key": api_secret_key,
 77 |                 "access_token": access_token,
 78 |                 "access_token_secret": access_token_secret,
 79 |             },
 80 |             indent=4,
 81 |         )
 82 |         + "\n"
 83 |     )
 84 | 
 85 | 
 86 | @cli.command()
 87 | @click.argument(
 88 |     "db_path",
 89 |     type=click.Path(file_okay=True, dir_okay=False, allow_dash=False),
 90 |     required=True,
 91 | )
 92 | @add_identifier_options
 93 | @click.option(
 94 |     "-a",
 95 |     "--auth",
 96 |     type=click.Path(file_okay=True, dir_okay=False, allow_dash=True, exists=True),
 97 |     default="auth.json",
 98 |     help="Path to auth.json token file",
 99 | )
100 | @click.option("--ids", is_flag=True, help="Treat input as user IDs, not screen names")
101 | @click.option("--silent", is_flag=True, help="Disable progress bar")
102 | def followers(db_path, identifiers, attach, sql, auth, ids, silent):
103 |     "Save followers for specified users (defaults to authenticated user)"
104 |     _shared_friends_followers(
105 |         db_path, identifiers, attach, sql, auth, ids, silent, "followers"
106 |     )
107 | 
108 | 
def _shared_friends_followers(
    db_path, identifiers, attach, sql, auth, ids, silent, noun
):
    """Fetch and save the friends or followers of each requested user.

    ``noun`` must be ``"friends"`` or ``"followers"`` and selects both the
    list that is fetched and which side of the relationship is recorded
    when saving users. ``identifiers``/``attach``/``sql`` are resolved via
    ``utils.resolve_identifiers``; when that yields nothing, the profile
    returned by ``utils.get_profile(db, session)`` is used. ``ids`` treats
    identifiers as user IDs instead of screen names, and ``silent``
    disables the progress bar.
    """
    assert noun in ("friends", "followers")
    # Close the token file promptly rather than leaking the handle.
    with open(auth) as auth_file:
        auth = json.load(auth_file)
    session = utils.session_for_auth(auth)
    db = utils.open_database(db_path)

    identifiers = utils.resolve_identifiers(db, identifiers, attach, sql)

    if not identifiers:
        profile = utils.get_profile(db, session)
        identifiers = [profile["screen_name"]]

    for identifier in identifiers:
        if ids:
            kwargs = {"user_id": identifier}
        else:
            kwargs = {"screen_name": identifier}

        profile = utils.get_profile(db, session, **kwargs)
        screen_name = profile["screen_name"]
        user_id = profile["id"]

        # Record which side of the follow relationship this user is on.
        save_users_kwargs = {}
        if noun == "followers":
            save_users_kwargs["followed_id"] = user_id
        elif noun == "friends":
            save_users_kwargs["follower_id"] = user_id

        def go(update):
            # Save each chunk as it arrives; nothing is accumulated in
            # memory beyond the chunk currently being written.
            for users_chunk in utils.fetch_user_list_chunks(
                session, user_id, screen_name, noun=noun
            ):
                utils.save_users(db, users_chunk, **save_users_kwargs)
                update(len(users_chunk))

        if not silent:
            # The profile's own count gives the progress bar its length.
            count = profile["{}_count".format(noun)]
            with click.progressbar(
                length=count,
                label="Importing {:,} {} for @{}".format(count, noun, screen_name),
            ) as bar:
                go(bar.update)
        else:
            go(lambda x: None)
160 | 
161 | 
@cli.command()
@click.argument(
    "db_path",
    type=click.Path(file_okay=True, dir_okay=False, allow_dash=False),
    required=True,
)
@add_identifier_options
@click.option(
    "-a",
    "--auth",
    type=click.Path(file_okay=True, dir_okay=False, allow_dash=True, exists=True),
    default="auth.json",
    help="Path to auth.json token file",
)
@click.option("--ids", is_flag=True, help="Treat input as user IDs, not screen names")
@click.option("--silent", is_flag=True, help="Disable progress bar")
def friends(db_path, identifiers, attach, sql, auth, ids, silent):
    "Save friends for specified users (defaults to authenticated user)"
    # Thin wrapper: all the work happens in the shared friends/followers helper.
    _shared_friends_followers(
        db_path=db_path,
        identifiers=identifiers,
        attach=attach,
        sql=sql,
        auth=auth,
        ids=ids,
        silent=silent,
        noun="friends",
    )
183 | 
184 | 
@cli.command()
@click.argument(
    "db_path",
    type=click.Path(file_okay=True, dir_okay=False, allow_dash=False),
    required=True,
)
@click.option(
    "-a",
    "--auth",
    type=click.Path(file_okay=True, dir_okay=False, allow_dash=True, exists=True),
    default="auth.json",
    help="Path to auth.json token file",
)
@click.option("--user_id", help="Numeric user ID")
@click.option("--screen_name", help="Screen name")
@click.option("--stop_after", type=int, help="Stop after this many")
def favorites(db_path, auth, user_id, screen_name, stop_after):
    "Save tweets favorited by specified user"
    # Fetched favorites are saved with favorited_by set to the id of the
    # profile resolved from user_id / screen_name.
    # Close the token file promptly rather than leaking the handle.
    with open(auth) as auth_file:
        auth = json.load(auth_file)
    session = utils.session_for_auth(auth)
    db = utils.open_database(db_path)
    profile = utils.get_profile(db, session, user_id, screen_name)
    with click.progressbar(
        utils.fetch_favorites(session, db, user_id, screen_name, stop_after),
        label="Importing favorites",
        show_pos=True,
    ) as bar:
        utils.save_tweets(db, bar, favorited_by=profile["id"])
213 | 
214 | 
@cli.command(name="user-timeline")
@click.argument(
    "db_path",
    type=click.Path(file_okay=True, dir_okay=False, allow_dash=False),
    required=True,
)
@add_identifier_options
@click.option(
    "-a",
    "--auth",
    type=click.Path(file_okay=True, dir_okay=False, allow_dash=True, exists=True),
    default="auth.json",
    help="Path to auth.json token file",
)
@click.option("--ids", is_flag=True, help="Treat input as user IDs, not screen names")
@click.option("--stop_after", type=int, help="Only pull this number of recent tweets")
@click.option("--user_id", help="Numeric user ID", hidden=True)
@click.option("--screen_name", help="Screen name", hidden=True)
@click.option(
    "--since",
    is_flag=True,
    help="Pull tweets since last retrieved tweet",
)
@click.option("--since_id", type=str, help="Pull tweets since this Tweet ID")
def user_timeline(
    db_path,
    identifiers,
    attach,
    sql,
    auth,
    ids,
    stop_after,
    user_id,
    screen_name,
    since,
    since_id,
):
    "Save tweets posted by specified user"
    # Identifiers come from the arguments / --sql, plus the legacy hidden
    # --user_id and --screen_name options. With no identifiers at all, the
    # profile returned by utils.get_profile is used. Tweets are saved in
    # chunks of 100 per user.
    # Close the token file promptly rather than leaking the handle.
    with open(auth) as auth_file:
        auth = json.load(auth_file)
    session = utils.session_for_auth(auth)
    db = utils.open_database(db_path)
    identifiers = utils.resolve_identifiers(db, identifiers, attach, sql)

    # Backwards compatible support for old --user_id and --screen_name options
    if screen_name:
        if ids:
            raise click.ClickException("Cannot use --screen_name with --ids")
        identifiers.append(screen_name)

    if user_id:
        if not identifiers:
            identifiers = [user_id]
            # A lone --user_id has to be looked up as a user ID; without
            # this it would be sent to the profile lookup as a screen name.
            ids = True
        else:
            if not ids:
                raise click.ClickException("Use --user_id with --ids")
            identifiers.append(user_id)

    # If identifiers is empty, fetch the authenticated user
    fetch_profiles = True
    if not identifiers:
        fetch_profiles = False
        profile = utils.get_profile(db, session, user_id, screen_name)
        identifiers = [profile["screen_name"]]
        ids = False

    # Pad every label to the longest identifier so the progress bars align.
    format_string = (
        "@{:" + str(max(len(str(identifier)) for identifier in identifiers)) + "}"
    )

    for identifier in identifiers:
        kwargs = {}
        if ids:
            kwargs["user_id"] = identifier
        else:
            kwargs["screen_name"] = identifier
        if fetch_profiles:
            profile = utils.get_profile(db, session, **kwargs)
        else:
            # Profile was already fetched above; re-read the stored row.
            profile = db["users"].get(profile["id"])
        expected_length = profile["statuses_count"]

        # An incremental fetch has no known length in advance.
        if since or since_id:
            expected_length = None

        with click.progressbar(
            utils.fetch_user_timeline(
                session,
                db,
                stop_after=stop_after,
                since_id=since_id,
                since=since,
                **kwargs
            ),
            length=expected_length,
            label=format_string.format(profile["screen_name"]),
            show_pos=True,
        ) as bar:
            # Save them 100 at a time
            chunk = []
            for tweet in bar:
                chunk.append(tweet)
                if len(chunk) >= 100:
                    utils.save_tweets(db, chunk)
                    chunk = []
            if chunk:
                utils.save_tweets(db, chunk)
321 | 
322 | 
@cli.command(name="home-timeline")
@click.argument(
    "db_path",
    type=click.Path(file_okay=True, dir_okay=False, allow_dash=False),
    required=True,
)
@click.option(
    "-a",
    "--auth",
    type=click.Path(file_okay=True, dir_okay=False, allow_dash=True, exists=True),
    default="auth.json",
    help="Path to auth.json token file",
)
@click.option(
    "--since",
    is_flag=True,
    help="Pull tweets since last retrieved tweet",
)
@click.option("--since_id", type=str, help="Pull tweets since this Tweet ID")
def home_timeline(db_path, auth, since, since_id):
    "Save tweets from timeline for authenticated user"
    # Endpoint-specific settings; the fetch/save loop lives in _shared_timeline.
    timeline_options = dict(
        table="timeline_tweets",
        api_url="https://api.twitter.com/1.1/statuses/home_timeline.json",
        since_type="home",
    )
    _shared_timeline(db_path, auth, since, since_id, **timeline_options)
353 | 
354 | 
@cli.command(name="mentions-timeline")
@click.argument(
    "db_path",
    type=click.Path(file_okay=True, dir_okay=False, allow_dash=False),
    required=True,
)
@click.option(
    "-a",
    "--auth",
    type=click.Path(file_okay=True, dir_okay=False, allow_dash=True, exists=True),
    default="auth.json",
    help="Path to auth.json token file",
)
@click.option(
    "--since",
    is_flag=True,
    help="Pull tweets since last retrieved mention",
)
@click.option("--since_id", type=str, help="Pull mentions since this Tweet ID")
def mentions_timeline(db_path, auth, since, since_id):
    "Save tweets that mention the authenticated user"
    # Endpoint-specific settings; the fetch/save loop lives in _shared_timeline.
    timeline_options = dict(
        table="mentions_tweets",
        api_url="https://api.twitter.com/1.1/statuses/mentions_timeline.json",
        sleep=10,
        since_type="mentions",
    )
    _shared_timeline(db_path, auth, since, since_id, **timeline_options)
386 | 
387 | 
def _shared_timeline(
    db_path, auth, since, since_id, table, api_url, sleep=1, since_type=None
):
    """Fetch a timeline and save its tweets.

    Tweets yielded by ``utils.fetch_timeline`` for ``api_url`` are saved in
    chunks of 100. For each chunk, ``table`` additionally receives one
    ``(user, tweet)`` row per tweet, where ``user`` is the id of the profile
    returned by ``utils.get_profile(db, session)``. ``sleep``, ``since``,
    ``since_id`` and ``since_type`` are passed through to
    ``utils.fetch_timeline``.
    """
    # Close the token file promptly rather than leaking the handle.
    with open(auth) as auth_file:
        auth = json.load(auth_file)
    session = utils.session_for_auth(auth)
    db = utils.open_database(db_path)
    profile = utils.get_profile(db, session)
    # Fixed length used for the progress bar.
    expected_length = 800
    since_key = profile["id"]

    with click.progressbar(
        utils.fetch_timeline(
            session,
            api_url,
            db,
            sleep=sleep,
            since=since,
            since_id=since_id,
            since_type=since_type,
            since_key=since_key,
        ),
        length=expected_length,
        label="Importing tweets",
        show_pos=True,
    ) as bar:
        # Save them 100 at a time
        def save_chunk(db, chunk):
            utils.save_tweets(db, chunk)
            # Record whose timeline they came from
            db[table].insert_all(
                [{"user": profile["id"], "tweet": tweet["id"]} for tweet in chunk],
                pk=("user", "tweet"),
                foreign_keys=("user", "tweet"),
                replace=True,
            )

        chunk = []
        for tweet in bar:
            chunk.append(tweet)
            if len(chunk) >= 100:
                save_chunk(db, chunk)
                chunk = []
        if chunk:
            save_chunk(db, chunk)
432 | 
433 | 
@cli.command(name="users-lookup")
@click.argument(
    "db_path",
    type=click.Path(file_okay=True, dir_okay=False, allow_dash=False),
    required=True,
)
@add_identifier_options
@click.option(
    "-a",
    "--auth",
    type=click.Path(file_okay=True, dir_okay=False, allow_dash=True, exists=True),
    default="auth.json",
    help="Path to auth.json token file",
)
@click.option("--ids", is_flag=True, help="Treat input as user IDs, not screen names")
def users_lookup(db_path, identifiers, attach, sql, auth, ids):
    "Fetch user accounts"
    # The docstring above is the --help text, so notes live in comments.
    # Use a context manager so the credentials file handle is closed promptly.
    with open(auth) as auth_file:
        auth = json.load(auth_file)
    session = utils.session_for_auth(auth)
    db = utils.open_database(db_path)
    identifiers = utils.resolve_identifiers(db, identifiers, attach, sql)
    # Users arrive in batches; each batch is saved as soon as it is fetched
    for batch in utils.fetch_user_batches(session, identifiers, ids):
        utils.save_users(db, batch)
457 | 
458 | 
@cli.command(name="statuses-lookup")
@click.argument(
    "db_path",
    type=click.Path(file_okay=True, dir_okay=False, allow_dash=False),
    required=True,
)
@add_identifier_options
@click.option(
    "-a",
    "--auth",
    type=click.Path(file_okay=True, dir_okay=False, allow_dash=True, exists=True),
    default="auth.json",
    help="Path to auth.json token file",
)
@click.option(
    "--skip-existing", is_flag=True, help="Skip tweets that are already in the DB"
)
@click.option("--silent", is_flag=True, help="Disable progress bar")
def statuses_lookup(db_path, identifiers, attach, sql, auth, skip_existing, silent):
    "Fetch tweets by their IDs"
    # The docstring above is the --help text, so notes live in comments.
    # Use a context manager so the credentials file handle is closed promptly.
    with open(auth) as auth_file:
        auth = json.load(auth_file)
    session = utils.session_for_auth(auth)
    db = utils.open_database(db_path)
    identifiers = utils.resolve_identifiers(db, identifiers, attach, sql)
    if skip_existing:
        # Drop every identifier whose tweet is already stored
        existing_ids = {
            row[0] for row in db.conn.execute("select id from tweets").fetchall()
        }
        identifiers = [i for i in identifiers if int(i) not in existing_ids]
    if silent:
        for batch in utils.fetch_status_batches(session, identifiers):
            utils.save_tweets(db, batch)
    else:
        # Do it with a progress bar
        count = len(identifiers)
        with click.progressbar(
            length=count,
            label="Importing {:,} tweet{}".format(count, "" if count == 1 else "s"),
        ) as bar:
            for batch in utils.fetch_status_batches(session, identifiers):
                utils.save_tweets(db, batch)
                # Advance by the number of tweets actually returned
                bar.update(len(batch))
501 | 
502 | 
@cli.command(name="lists")
@click.argument(
    "db_path",
    type=click.Path(file_okay=True, dir_okay=False, allow_dash=False),
    required=True,
)
@add_identifier_options
@click.option(
    "-a",
    "--auth",
    type=click.Path(file_okay=True, dir_okay=False, allow_dash=True, exists=True),
    default="auth.json",
    help="Path to auth.json token file",
)
@click.option("--ids", is_flag=True, help="Treat input as user IDs, not screen_names")
@click.option("--members", is_flag=True, help="Retrieve members for each list")
def lists(db_path, identifiers, attach, sql, auth, ids, members):
    "Fetch lists belonging to specified users"
    # The docstring above is the --help text, so notes live in comments.
    # Use a context manager so the credentials file handle is closed promptly.
    with open(auth) as auth_file:
        auth = json.load(auth_file)
    session = utils.session_for_auth(auth)
    db = utils.open_database(db_path)
    identifiers = utils.resolve_identifiers(db, identifiers, attach, sql)
    # Make sure we have saved these users to the database
    for batch in utils.fetch_user_batches(session, identifiers, ids):
        utils.save_users(db, batch)
    for index, identifier in enumerate(identifiers):
        if index:
            # Rate limit is one per minute, so pause before every request
            # after the first. (The previous flag-based check never flipped
            # its flag and therefore never slept.)
            time.sleep(60)
        if ids:
            kwargs = {"user_id": identifier}
        else:
            kwargs = {"screen_name": identifier}
        fetched_lists = utils.fetch_lists(db, session, **kwargs)
        if members:
            for new_list in fetched_lists:
                # full_name is expected to look like "@owner/slug"; the "@"
                # is a prefix, so it has to be stripped from the left.
                utils.fetch_and_save_list(
                    db, session, new_list["full_name"].lstrip("@")
                )
544 | 
545 | 
@cli.command(name="list-members")
@click.argument(
    "db_path",
    type=click.Path(file_okay=True, dir_okay=False, allow_dash=False),
    required=True,
)
@click.argument("identifiers", type=str, nargs=-1)
@click.option(
    "-a",
    "--auth",
    type=click.Path(file_okay=True, dir_okay=False, allow_dash=True, exists=True),
    default="auth.json",
    help="Path to auth.json token file",
)
@click.option(
    "--ids", is_flag=True, help="Treat input as list IDs, not user/slug strings"
)
def list_members(db_path, identifiers, auth, ids):
    "Fetch lists - accepts one or more screen_name/list_slug identifiers"
    # The docstring above is the --help text, so notes live in comments.
    # Use a context manager so the credentials file handle is closed promptly.
    with open(auth) as auth_file:
        auth = json.load(auth_file)
    session = utils.session_for_auth(auth)
    db = utils.open_database(db_path)
    # Each identifier is handed to the helper along with the --ids flag
    for identifier in identifiers:
        utils.fetch_and_save_list(db, session, identifier, ids)
570 | 
571 | 
@cli.command(name="followers-ids")
@click.argument(
    "db_path",
    type=click.Path(file_okay=True, dir_okay=False, allow_dash=False),
    required=True,
)
@add_identifier_options
@click.option(
    "-a",
    "--auth",
    type=click.Path(file_okay=True, dir_okay=False, allow_dash=True, exists=True),
    default="auth.json",
    help="Path to auth.json token file",
)
@click.option("--ids", is_flag=True, help="Treat input as user IDs, not screen names")
@click.option(
    "--sleep", type=int, default=61, help="Seconds to sleep between API calls"
)
def followers_ids(db_path, identifiers, attach, sql, auth, ids, sleep):
    "Populate following table with IDs of account followers"
    # --ids selects between user_id and screen_name lookups in the shared
    # helper, and rows are written to the "following" table; the help text
    # and docstring now say so.
    # The looked-up account is the one being followed, so its ID is stored
    # as followed_id and each returned ID as follower_id.
    _shared_friends_ids_followers_ids(
        db_path,
        identifiers,
        attach,
        sql,
        auth,
        ids,
        sleep,
        api_url="https://api.twitter.com/1.1/followers/ids.json",
        first_key="followed_id",
        second_key="follower_id",
    )
606 | 
607 | 
@cli.command(name="friends-ids")
@click.argument(
    "db_path",
    type=click.Path(file_okay=True, dir_okay=False, allow_dash=False),
    required=True,
)
@add_identifier_options
@click.option(
    "-a",
    "--auth",
    type=click.Path(file_okay=True, dir_okay=False, allow_dash=True, exists=True),
    default="auth.json",
    help="Path to auth.json token file",
)
@click.option("--ids", is_flag=True, help="Treat input as user IDs, not screen names")
@click.option(
    "--sleep", type=int, default=61, help="Seconds to sleep between API calls"
)
def friends_ids(db_path, identifiers, attach, sql, auth, ids, sleep):
    "Populate following table with IDs of account friends"
    # --ids selects between user_id and screen_name lookups in the shared
    # helper, and rows are written to the "following" table; the help text
    # and docstring now say so.
    # The looked-up account is the one doing the following, so its ID is
    # stored as follower_id and each returned ID as followed_id.
    _shared_friends_ids_followers_ids(
        db_path,
        identifiers,
        attach,
        sql,
        auth,
        ids,
        sleep,
        api_url="https://api.twitter.com/1.1/friends/ids.json",
        first_key="follower_id",
        second_key="followed_id",
    )
642 | 
643 | 
@cli.command()
@click.argument(
    "db_path",
    type=click.Path(file_okay=True, dir_okay=False, allow_dash=False),
    required=True,
)
@click.argument("track", type=str, required=True, nargs=-1)
@click.option(
    "-a",
    "--auth",
    type=click.Path(file_okay=True, dir_okay=False, allow_dash=True, exists=True),
    default="auth.json",
    help="Path to auth.json token file",
)
@click.option("--verbose", is_flag=True, help="Verbose mode: display every tweet")
def track(db_path, track, auth, verbose):
    "Experimental: Save tweets matching these keywords in real-time"
    # The docstring above is the --help text, so notes live in comments.
    # Use a context manager so the credentials file handle is closed promptly.
    with open(auth) as auth_file:
        auth = json.load(auth_file)
    session = utils.session_for_auth(auth)
    db = utils.open_database(db_path)
    for tweet in utils.stream_filter(session, track=track):
        if verbose:
            print(json.dumps(tweet, indent=2))
        # Wrap each save in a transaction so every tweet is committed as it
        # arrives
        with db.conn:
            utils.save_tweets(db, [tweet])
669 | 
670 | 
@cli.command()
@click.argument(
    "db_path",
    type=click.Path(file_okay=True, dir_okay=False, allow_dash=False),
    required=True,
)
@add_identifier_options
@click.option("--ids", is_flag=True, help="Treat input as user IDs, not screen names")
@click.option(
    "-a",
    "--auth",
    type=click.Path(file_okay=True, dir_okay=False, allow_dash=True, exists=True),
    default="auth.json",
    help="Path to auth.json token file",
)
@click.option("--verbose", is_flag=True, help="Verbose mode: display every tweet")
def follow(db_path, identifiers, attach, sql, ids, auth, verbose):
    "Experimental: Follow these Twitter users and save tweets in real-time"
    # The docstring above is the --help text, so notes live in comments.
    # Use a context manager so the credentials file handle is closed promptly.
    with open(auth) as auth_file:
        auth = json.load(auth_file)
    session = utils.session_for_auth(auth)
    db = utils.open_database(db_path)
    identifiers = utils.resolve_identifiers(db, identifiers, attach, sql)
    # Make sure we have saved these users to the database
    for batch in utils.fetch_user_batches(session, identifiers, ids):
        utils.save_users(db, batch)
    # Ensure we have user IDs, not screen names
    if ids:
        follow_ids = identifiers
    else:
        follow_ids = utils.user_ids_for_screen_names(db, identifiers)
    # Start streaming:
    for tweet in utils.stream_filter(session, follow=follow_ids):
        if verbose:
            print(json.dumps(tweet, indent=2))
        # Wrap each save in a transaction so every tweet is committed as it
        # arrives
        with db.conn:
            utils.save_tweets(db, [tweet])
707 | 
708 | 
def _shared_friends_ids_followers_ids(
    db_path, identifiers, attach, sql, auth, ids, sleep, api_url, first_key, second_key
):
    """
    Shared implementation of the followers-ids and friends-ids commands.

    For every resolved identifier the user's profile is fetched (and saved),
    then the ID batches yielded by ``utils.cursor_paginate`` for ``api_url``
    are written to the ``following`` table.

    ids: when true, identifiers are user IDs rather than screen names
    sleep: seconds to wait after each identifier; also passed to
        ``utils.cursor_paginate``
    first_key: ``following`` column that receives the looked-up user's ID
    second_key: ``following`` column that receives each ID from the API
    """
    # Use a context manager so the credentials file handle is closed promptly
    with open(auth) as auth_file:
        auth = json.load(auth_file)
    session = utils.session_for_auth(auth)
    db = utils.open_database(db_path)
    identifiers = utils.resolve_identifiers(db, identifiers, attach, sql)
    for identifier in identifiers:
        # Make sure this user is saved
        arg_user_id = identifier if ids else None
        arg_screen_name = None if ids else identifier
        profile = utils.get_profile(db, session, arg_user_id, arg_screen_name)
        user_id = profile["id"]
        args = {("user_id" if ids else "screen_name"): identifier}
        for id_batch in utils.cursor_paginate(
            session, api_url, args, "ids", 5000, sleep
        ):
            # One timestamp per batch; ignore=True keeps the first_seen value
            # of rows that already exist
            first_seen = datetime.datetime.utcnow().isoformat()
            db["following"].insert_all(
                (
                    {first_key: user_id, second_key: other_id, "first_seen": first_seen}
                    for other_id in id_batch
                ),
                ignore=True,
            )
        time.sleep(sleep)
735 | 
736 | 
@cli.command(name="import")
@click.argument(
    "db_path",
    type=click.Path(file_okay=True, dir_okay=True, allow_dash=False),
    required=True,
)
@click.argument(
    "paths",
    type=click.Path(file_okay=True, dir_okay=True, allow_dash=False, exists=True),
    required=True,
    nargs=-1,
)
def import_(db_path, paths):
    """
    Import data from a Twitter exported archive. Input can be the path to a zip
    file, a directory full of .js files or one or more direct .js files.
    """
    db = utils.open_database(db_path)
    for filepath in paths:
        path = pathlib.Path(filepath)
        if path.suffix == ".zip":
            for filename, content in utils.read_archive_js(filepath):
                archive.import_from_file(db, filename, content)
        elif path.is_dir():
            # Import every .js file in this directory. read_bytes() closes the
            # file itself, and the loop variable no longer shadows the outer
            # ``filepath``.
            for js_path in path.glob("*.js"):
                archive.import_from_file(db, js_path.name, js_path.read_bytes())
        elif path.suffix == ".js":
            archive.import_from_file(db, path.name, path.read_bytes())
        else:
            raise click.ClickException("Path must be a .js or .zip file or a directory")
768 | 
769 | 
@cli.command()
@click.argument(
    "db_path",
    type=click.Path(file_okay=True, dir_okay=False, allow_dash=False),
    required=True,
)
@click.argument("q")
@click.option(
    "-a",
    "--auth",
    type=click.Path(file_okay=True, dir_okay=False, allow_dash=True, exists=True),
    default="auth.json",
    help="Path to auth.json token file",
)
@click.option(
    "--since",
    is_flag=True,
    help="Pull tweets since last retrieved tweet",
)
@click.option(
    "--geocode",
    type=str,
    help="latitude,longitude,radius - where radius is a number followed by mi or km",
)
@click.option("--lang", type=str, help="ISO 639-1 language code")
@click.option("--locale", type=str, help="Locale: only 'ja' is currently effective")
@click.option("--result_type", type=click.Choice(["mixed", "recent", "popular"]))
@click.option("--count", type=int, default=100, help="Number of results per page")
@click.option("--stop_after", type=int, help="Stop after this many")
@click.option("--since_id", type=str, help="Pull tweets since this Tweet ID")
def search(db_path, q, auth, since, **kwargs):
    """
    Save tweets from a search. Full documentation here:

    https://developer.twitter.com/en/docs/tweets/search/api-reference/get-search-tweets
    """
    # since_id and stop_after steer pagination; every other option left in
    # kwargs is sent to the search endpoint as a query argument.
    since_id = kwargs.pop("since_id", None)
    stop_after = kwargs.pop("stop_after", None)
    # Use a context manager so the credentials file handle is closed promptly
    with open(auth) as auth_file:
        auth = json.load(auth_file)
    session = utils.session_for_auth(auth)
    db = utils.open_database(db_path)

    search_args = {"q": q}
    for key, value in kwargs.items():
        if value is not None:
            search_args[key] = value

    # Hash of the exact argument combination; it is the key under which
    # fetch_timeline stores and looks up the since_id for this search.
    args_hash = hashlib.sha1(
        json.dumps(search_args, sort_keys=True, separators=(",", ":")).encode("utf8")
    ).hexdigest()

    tweets = utils.fetch_timeline(
        session,
        "https://api.twitter.com/1.1/search/tweets.json",
        db,
        search_args,
        sleep=6,
        key="statuses",
        stop_after=stop_after,
        since_id=since_id,
        # Forward --since; it was previously accepted but silently ignored
        since=since,
        since_type="search",
        since_key=args_hash,
    )

    if not db["search_runs"].exists():
        db["search_runs"].create(
            {"id": int, "name": str, "args": str, "started": str, "hash": str}, pk="id"
        )

    def save_chunk(db, search_run_id, chunk):
        utils.save_tweets(db, chunk)
        # Record which search run produced them
        db["search_runs_tweets"].insert_all(
            [{"search_run": search_run_id, "tweet": tweet["id"]} for tweet in chunk],
            pk=("search_run", "tweet"),
            foreign_keys=(
                ("search_run", "search_runs", "id"),
                ("tweet", "tweets", "id"),
            ),
            replace=True,
        )

    chunk = []
    search_run_id = None
    for tweet in tweets:
        if search_run_id is None:
            # Create the search_runs row lazily, so a search that returns
            # nothing leaves no run behind
            search_run_id = (
                db["search_runs"]
                .insert(
                    {
                        "name": search_args["q"],
                        "args": {
                            key: value
                            for key, value in search_args.items()
                            if key not in {"q", "count"}
                        },
                        "started": datetime.datetime.utcnow().isoformat(),
                        "hash": args_hash,
                    },
                    alter=True,
                )
                .last_pk
            )
        chunk.append(tweet)
        # Save them 10 at a time
        if len(chunk) >= 10:
            save_chunk(db, search_run_id, chunk)
            chunk = []
    if chunk:
        save_chunk(db, search_run_id, chunk)
881 | 


--------------------------------------------------------------------------------
/twitter_to_sqlite/migrations.py:
--------------------------------------------------------------------------------
 1 | from .utils import extract_and_save_source
 2 | 
 3 | MIGRATIONS = []
 4 | 
 5 | 
 6 | def migration(fn):
 7 |     MIGRATIONS.append(fn)
 8 |     return fn
 9 | 
10 | 
@migration
def convert_source_column(db):
    """
    Rewrite tweets.source values that start with '<' using
    extract_and_save_source, then try to index the column and add a
    foreign key for it.
    """
    if "tweets" not in set(db.table_names()):
        return
    # Now we extract any '<a href=...' records from the source
    rows = db.conn.execute(
        "select id, source from tweets where source like '<%'"
    ).fetchall()
    for tweet_id, raw_source in rows:
        new_source = extract_and_save_source(db, raw_source)
        db["tweets"].update(tweet_id, {"source": new_source})
    # Both follow-up steps are best-effort: any failure is ignored
    try:
        db["tweets"].create_index(["source"])
    except Exception:
        pass
    try:
        db["tweets"].add_foreign_key("source")
    except Exception:
        pass
29 | 


--------------------------------------------------------------------------------
/twitter_to_sqlite/utils.py:
--------------------------------------------------------------------------------
  1 | import click
  2 | import datetime
  3 | import html
  4 | import json
  5 | import pathlib
  6 | import re
  7 | import sqlite3
  8 | import time
  9 | import urllib.parse
 10 | import zipfile
 11 | 
 12 | from dateutil import parser
 13 | from requests_oauthlib import OAuth1Session
 14 | import sqlite_utils
 15 | 
# Twitter API error codes
# fetch_timeline compares this against the first error code in a response to
# decide whether to sleep and retry.
RATE_LIMIT_ERROR_CODE = 88

# Numeric IDs for the kinds of timeline whose since_id is tracked.
# ensure_tables copies these name/id pairs into the since_id_types table and
# fetch_timeline maps its since_type argument through this dict.
SINCE_ID_TYPES = {
    "user": 1,
    "home": 2,
    "mentions": 3,
    "search": 4,
}
# Numeric IDs for the user counts recorded over time; ensure_tables copies
# these name/id pairs into the count_history_types table.
COUNT_HISTORY_TYPES = {
    "followers": 1,
    "friends": 2,
    "listed": 3,
    # Don't track these - they're uninteresting and really noisy in terms
    # of writing new rows to the count_history table:
    # "favourites": 4,
    # "statuses": 5,
}

# Matches an HTML anchor tag, capturing the href value as "url" and the link
# text as "name".
source_re = re.compile('<a href="(?P<url>.*?)".*?>(?P<name>.*?)</a>')
 36 | 
 37 | 
 38 | class UserDoesNotExist(click.ClickException):
 39 |     def __init__(self, identifier):
 40 |         super().__init__("User '{}' does not exist".format(identifier))
 41 | 
 42 | 
 43 | def open_database(db_path):
 44 |     db = sqlite_utils.Database(db_path)
 45 |     # Only run migrations if this is an existing DB (has tables)
 46 |     if db.tables:
 47 |         migrate(db)
 48 |     return db
 49 | 
 50 | 
 51 | def migrate(db):
 52 |     from twitter_to_sqlite.migrations import MIGRATIONS
 53 | 
 54 |     if "migrations" not in db.table_names():
 55 |         db["migrations"].create({"name": str, "applied": str}, pk="name")
 56 |     applied_migrations = {
 57 |         m[0] for m in db.conn.execute("select name from migrations").fetchall()
 58 |     }
 59 |     for migration in MIGRATIONS:
 60 |         name = migration.__name__
 61 |         if name in applied_migrations:
 62 |             continue
 63 |         migration(db)
 64 |         db["migrations"].insert(
 65 |             {"name": name, "applied": datetime.datetime.utcnow().isoformat()}
 66 |         )
 67 | 
 68 | 
 69 | def session_for_auth(auth):
 70 |     return OAuth1Session(
 71 |         client_key=auth["api_key"],
 72 |         client_secret=auth["api_secret_key"],
 73 |         resource_owner_key=auth["access_token"],
 74 |         resource_owner_secret=auth["access_token_secret"],
 75 |     )
 76 | 
 77 | 
 78 | def fetch_user_list_chunks(
 79 |     session, user_id=None, screen_name=None, sleep=61, noun="followers"
 80 | ):
 81 |     cursor = -1
 82 |     users = []
 83 |     while cursor:
 84 |         headers, body = fetch_user_list(session, cursor, user_id, screen_name, noun)
 85 |         yield body["users"]
 86 |         cursor = body["next_cursor"]
 87 |         if not cursor:
 88 |             break
 89 |         time.sleep(sleep)  # Rate limit = 15 per 15 minutes!
 90 | 
 91 | 
 92 | def fetch_user_list(session, cursor, user_id=None, screen_name=None, noun="followers"):
 93 |     args = user_args(user_id, screen_name)
 94 |     args.update({"count": 200, "cursor": cursor})
 95 |     r = session.get(
 96 |         "https://api.twitter.com/1.1/{}/list.json?".format(noun)
 97 |         + urllib.parse.urlencode(args)
 98 |     )
 99 |     return r.headers, r.json()
100 | 
101 | 
def fetch_lists(db, session, user_id=None, screen_name=None):
    """
    Fetch the lists owned by a user, save them to the ``lists`` table and
    return them.

    Each list's owner is saved with save_users and replaced in the row by the
    owner's ID; ``created_at`` is parsed into a datetime.
    """
    params = user_args(user_id, screen_name)
    params["count"] = 1000
    # For the moment we don't paginate
    response = session.get(
        "https://api.twitter.com/1.1/lists/ownerships.json", params=params
    )
    rows = []
    for row in response.json()["lists"]:
        del row["id_str"]
        owner = row.pop("user")
        save_users(db, [owner])
        row["user"] = owner["id"]
        row["created_at"] = parser.parse(row["created_at"])
        rows.append(row)
    db["lists"].insert_all(rows, pk="id", foreign_keys=("user",), replace=True)
    return rows
117 | 
118 | 
def get_profile(db, session, user_id=None, screen_name=None):
    """
    Fetch a user profile, save it with save_users and return it.

    With neither ``user_id`` nor ``screen_name`` the authenticated account is
    fetched from verify_credentials; otherwise users/show is queried and a 404
    response raises UserDoesNotExist.
    """
    if user_id or screen_name:
        args = user_args(user_id, screen_name)
        url = "https://api.twitter.com/1.1/users/show.json"
        if args:
            url = "{}?{}".format(url, urllib.parse.urlencode(args))
        response = session.get(url)
        if response.status_code == 404:
            raise UserDoesNotExist(screen_name or user_id)
        profile = response.json()
    else:
        credentials_url = "https://api.twitter.com/1.1/account/verify_credentials.json"
        profile = session.get(credentials_url).json()
    save_users(db, [profile])
    return profile
135 | 
136 | 
def fetch_timeline(
    session,
    url,
    db,
    args=None,
    sleep=1,
    stop_after=None,
    key=None,
    since_id=None,
    since=False,
    since_type=None,
    since_key=None,
):
    """
    Yield tweets from a timeline-style endpoint, paging backwards with max_id.

    session: object whose ``get(url, params=...)`` performs the request
    url: endpoint to page through
    db: database used to look up and record the since_id
    args: extra query arguments (copied, never mutated)
    sleep: seconds to wait between pages
    stop_after: when set, used as the page size and only one page is fetched
    key: when set, the tweets are read from ``response[key]``
    since_id: only fetch tweets newer than this ID
    since: use the since_id recorded in the database instead
    since_type, since_key: identify the ``since_ids`` row to read and update;
        ``since_type`` must be a key of SINCE_ID_TYPES

    Raises click.ClickException if both ``since`` and ``since_id`` are given,
    and Exception for API errors other than rate limiting.
    """
    # See https://developer.twitter.com/en/docs/tweets/timelines/guides/working-with-timelines
    if since and since_id:
        raise click.ClickException("Use either --since or --since_id, not both")

    since_type_id = None
    last_since_id = None
    if since_type is not None:
        assert since_key is not None
        since_type_id = SINCE_ID_TYPES[since_type]
        # Figure out the last since_id in case we need it
        try:
            last_since_id = db.conn.execute(
                """
                select since_id from since_ids
                where type = ? and key = ?
                """,
                [since_type_id, since_key],
            ).fetchall()[0][0]
        except (IndexError, sqlite3.OperationalError):
            # No row recorded yet, or the since_ids table does not exist
            pass

    if since:
        # Load since_id from database
        since_id = last_since_id

    args = dict(args or {})
    args["count"] = 200
    if stop_after is not None:
        args["count"] = stop_after
    if since_id:
        args["since_id"] = since_id
    args["tweet_mode"] = "extended"
    min_seen_id = None
    num_rate_limit_errors = 0
    while True:
        if min_seen_id is not None:
            # Ask only for tweets older than everything seen so far
            args["max_id"] = min_seen_id - 1
        response = session.get(url, params=args)
        tweets = response.json()
        if "errors" in tweets:
            # Was it a rate limit error? If so sleep and try again
            if RATE_LIMIT_ERROR_CODE == tweets["errors"][0]["code"]:
                num_rate_limit_errors += 1
                assert num_rate_limit_errors < 5, "More than 5 rate limit errors"
                print(
                    "Rate limit exceeded - will sleep 15s and try again {}".format(
                        repr(response.headers)
                    )
                )
                time.sleep(15)
                continue
            else:
                raise Exception(str(tweets["errors"]))
        if key is not None:
            tweets = tweets[key]
        if not tweets:
            break
        for tweet in tweets:
            yield tweet
        min_seen_id = min(t["id"] for t in tweets)
        max_seen_id = max(t["id"] for t in tweets)
        if last_since_id is not None:
            max_seen_id = max((last_since_id, max_seen_id))
        # Keep a running maximum across pages. Pages arrive newest-first, so
        # without this a run that started with no stored since_id would
        # overwrite the stored value with each older page's smaller maximum.
        last_since_id = max_seen_id
        if since_type_id is not None and since_key is not None:
            db["since_ids"].insert(
                {
                    "type": since_type_id,
                    "key": since_key,
                    "since_id": max_seen_id,
                },
                replace=True,
            )
        if stop_after is not None:
            break
        time.sleep(sleep)
226 | 
227 | 
def fetch_user_timeline(
    session,
    db,
    user_id=None,
    screen_name=None,
    stop_after=None,
    since_id=None,
    since=False,
):
    """
    Yield tweets from a user's timeline by delegating to fetch_timeline.

    The since_id is tracked under "id:<user_id>" when a user ID is given and
    under the screen name otherwise.
    """
    if user_id:
        tracking_key = "id:{}".format(user_id)
    else:
        tracking_key = screen_name
    yield from fetch_timeline(
        session,
        "https://api.twitter.com/1.1/statuses/user_timeline.json",
        db,
        user_args(user_id, screen_name),
        sleep=1,
        stop_after=stop_after,
        since_id=since_id,
        since_type="user",
        since_key=tracking_key,
        since=since,
    )
250 | 
251 | 
def fetch_favorites(session, db, user_id=None, screen_name=None, stop_after=None):
    """Yield a user's favorited tweets by delegating to fetch_timeline."""
    yield from fetch_timeline(
        session,
        "https://api.twitter.com/1.1/favorites/list.json",
        db,
        user_args(user_id, screen_name),
        # Rate limit 75/15 mins = 5/minute = every 12 seconds
        sleep=12,
        stop_after=stop_after,
    )
264 | 
265 | 
def user_args(user_id, screen_name):
    """
    Return a dict of query arguments holding whichever of ``user_id`` and
    ``screen_name`` is truthy.
    """
    candidates = (("user_id", user_id), ("screen_name", screen_name))
    return {name: value for name, value in candidates if value}
273 | 
274 | 
def expand_entities(s, entities):
    """
    Return ``s`` with each entity's ``url`` replaced by its ``expanded_url``.

    Entities without a "url" key are skipped; a falsy ``expanded_url`` leaves
    the original URL in place.
    """
    for group in entities.values():
        for entity in group:
            if "url" not in entity:
                continue
            short_url = entity["url"]
            s = s.replace(short_url, entity["expanded_url"] or short_url)
    return s
282 | 
283 | 
def transform_user(user):
    """
    Normalise a user dict in place.

    Parses ``created_at``, expands the URLs in ``description`` and ``url``
    when matching entities exist, then drops ``entities``, ``status`` and
    every key ending in ``_str``.
    """
    user["created_at"] = parser.parse(user["created_at"])
    for field in ("description", "url"):
        if user[field] and field in user.get("entities", {}):
            user[field] = expand_entities(user[field], user["entities"][field])
    for unwanted in ("entities", "status"):
        user.pop(unwanted, None)
    # Collect the keys first; the dict cannot change size while iterated
    for name in [k for k in user if k.endswith("_str")]:
        del user[name]
297 | 
298 | 
def transform_tweet(tweet):
    """
    Normalise a tweet dict in place.

    Expands URLs in ``full_text`` (consuming ``entities``) and unescapes HTML
    entities, drops every ``*_str`` key plus the quoted-status extras, and
    converts ``created_at`` to an ISO 8601 string.
    """
    text = tweet["full_text"]
    entities = tweet.pop("entities")
    tweet["full_text"] = html.unescape(expand_entities(text, entities))
    unwanted = [name for name in tweet if name.endswith("_str")]
    unwanted.extend(["quoted_status_id", "quoted_status_permalink"])
    for name in unwanted:
        tweet.pop(name, None)
    parsed = parser.parse(tweet["created_at"])
    tweet["created_at"] = parsed.isoformat()
311 | 
312 | 
def ensure_tables(db):
    """Create the core tables, indexes and lookup rows that are missing from ``db``.

    Every table is only created when its name is absent from
    ``db.table_names()``, so existing tables are left untouched. The index
    check on ``following`` runs on every call.

    :param db: database object offering ``table_names()`` and ``db[name]``
        table access (presumably a ``sqlite_utils.Database`` -- confirm
        against callers).
    """
    # Snapshot of the tables present before anything is created below
    table_names = set(db.table_names())
    if "places" not in table_names:
        db["places"].create({"id": str}, pk="id")
    if "sources" not in table_names:
        db["sources"].create({"id": str, "name": str, "url": str}, pk="id")
    if "users" not in table_names:
        db["users"].create(
            {
                "id": int,
                "screen_name": str,
                "name": str,
                "description": str,
                "location": str,
            },
            pk="id",
        )
        # Full-text search over the textual profile columns
        db["users"].enable_fts(
            ["name", "screen_name", "description", "location"], create_triggers=True
        )
    if "tweets" not in table_names:
        # Created after users/places/sources, which its foreign keys reference
        db["tweets"].create(
            {
                "id": int,
                "user": int,
                "created_at": str,
                "full_text": str,
                "retweeted_status": int,
                "quoted_status": int,
                "place": str,
                "source": str,
            },
            pk="id",
            foreign_keys=(
                ("user", "users", "id"),
                ("place", "places", "id"),
                ("source", "sources", "id"),
            ),
        )
        db["tweets"].enable_fts(["full_text"], create_triggers=True)
        # Self-referencing foreign keys are added once the table exists
        db["tweets"].add_foreign_key("retweeted_status", "tweets")
        db["tweets"].add_foreign_key("quoted_status", "tweets")
    if "following" not in table_names:
        db["following"].create(
            {"followed_id": int, "follower_id": int, "first_seen": str},
            pk=("followed_id", "follower_id"),
            foreign_keys=(
                ("followed_id", "users", "id"),
                ("follower_id", "users", "id"),
            ),
        )
    # Ensure following has indexes
    following_indexes = {tuple(i.columns) for i in db["following"].indexes}
    if ("followed_id",) not in following_indexes:
        db["following"].create_index(["followed_id"])
    if ("follower_id",) not in following_indexes:
        db["following"].create_index(["follower_id"])

    # Tables for tracking --since
    # NOTE(review): the guard only looks at "since_ids"; if "since_id_types"
    # already exists without it, the create below presumably fails -- confirm.
    if "since_ids" not in table_names:
        db["since_id_types"].create(
            {
                "id": int,
                "name": str,
            },
            pk="id",
        )
        # Seed the lookup table from the module-level SINCE_ID_TYPES mapping
        db["since_id_types"].insert_all(
            [{"id": id, "name": name} for name, id in SINCE_ID_TYPES.items()]
        )
        db["since_ids"].create(
            {"type": int, "key": str, "since_id": int},
            pk=("type", "key"),
            foreign_keys=(("type", "since_id_types", "id"),),
        )

    # Tables for recording history of user follower counts etc
    # NOTE(review): same pattern as above -- only "count_history" is checked
    # before "count_history_types" is created.
    if "count_history" not in table_names:
        db["count_history_types"].create(
            {
                "id": int,
                "name": str,
            },
            pk="id",
        )
        # Seed the lookup table from the module-level COUNT_HISTORY_TYPES mapping
        db["count_history_types"].insert_all(
            [{"id": id, "name": name} for name, id in COUNT_HISTORY_TYPES.items()]
        )
        db["count_history"].create(
            {"type": int, "user": int, "datetime": str, "count": int},
            pk=("type", "user", "datetime"),
            foreign_keys=(
                ("type", "count_history_types", "id"),
                ("user", "users", "id"),
            ),
        )
409 | 
410 | 
def save_tweets(db, tweets, favorited_by=None):
    """Store tweets plus their users, sources, places, nested tweets and media.

    Each tweet dict is modified in place: nested objects (user, place,
    quoted/retweeted status) are replaced by their ids before the tweet row
    is written.

    :param db: database object passed to ``ensure_tables`` and indexed by
        table name.
    :param tweets: iterable of tweet dicts; each must contain ``user`` and
        ``source`` keys.
    :param favorited_by: when not None, a ``favorited_by`` row linking this
        user value to every top-level tweet is written. Nested tweets are
        saved without it.
    """
    ensure_tables(db)
    for tweet in tweets:
        transform_tweet(tweet)
        # Split the embedded user out; the tweet row only keeps the user id
        user = tweet.pop("user")
        transform_user(user)
        tweet["user"] = user["id"]
        # Replace the raw source value with the key of its row in "sources"
        tweet["source"] = extract_and_save_source(db, tweet["source"])
        if tweet.get("place"):
            db["places"].insert(tweet["place"], pk="id", alter=True, replace=True)
            tweet["place"] = tweet["place"]["id"]
        # extended_entities contains media
        extended_entities = tweet.pop("extended_entities", None)
        # Deal with nested retweeted_status / quoted_status
        nested = []
        for tweet_key in ("quoted_status", "retweeted_status"):
            if tweet.get(tweet_key):
                nested.append(tweet[tweet_key])
                tweet[tweet_key] = tweet[tweet_key]["id"]
        if nested:
            # Nested tweets are written first, before the tweet that refers to them
            save_tweets(db, nested)
        db["users"].insert(user, pk="id", alter=True, replace=True)
        save_user_counts(db, user)
        # The value returned by insert() is reused below for the media m2m link
        table = db["tweets"].insert(tweet, pk="id", alter=True, replace=True)
        if favorited_by is not None:
            db["favorited_by"].insert(
                {"tweet": tweet["id"], "user": favorited_by},
                pk=("user", "tweet"),
                foreign_keys=("tweet", "user"),
                replace=True,
            )
        if extended_entities and extended_entities.get("media"):
            for media in extended_entities["media"]:
                # TODO: Remove this line when .m2m() grows alter=True
                db["media"].insert(media, pk="id", alter=True, replace=True)
                table.m2m("media", media, pk="id")
447 | 
448 | 
def save_users(db, users, followed_id=None, follower_id=None):
    """Write user records to ``users`` and optionally record follow relationships.

    At most one of ``followed_id`` / ``follower_id`` may be given. When one
    is, every user in ``users`` is paired with it in the ``following`` table
    (the user fills whichever side was left unset); pairs that already exist
    are ignored rather than overwritten.

    :param db: database object passed to ``ensure_tables`` and indexed by
        table name.
    :param users: collection of user dicts; iterated more than once.
    :param followed_id: id of the account that all ``users`` follow.
    :param follower_id: id of the account that follows all ``users``.
    """
    assert not (followed_id and follower_id)
    ensure_tables(db)
    for record in users:
        transform_user(record)
    users_table = db["users"]
    users_table.insert_all(users, pk="id", alter=True, replace=True)
    for record in users:
        save_user_counts(db, record)

    if not (followed_id or follower_id):
        return

    # One timestamp shared by every relationship recorded in this call
    first_seen = datetime.datetime.utcnow().isoformat()
    relationships = (
        {
            "followed_id": followed_id or record["id"],
            "follower_id": follower_id or record["id"],
            "first_seen": first_seen,
        }
        for record in users
    )
    db["following"].insert_all(relationships, ignore=True)
470 | 
471 | 
def fetch_user_batches(session, ids_or_screen_names, use_ids=False, sleep=1):
    """Yield the parsed ``users/lookup.json`` response for each batch of users.

    The input is consumed completely before the first request is made and is
    split into batches of at most 70 entries.

    :param session: object with a ``get(url, params=...)`` method whose
        return value has ``.json()``.
    :param ids_or_screen_names: iterable of user ids or screen names.
    :param use_ids: send the batch as ``user_id`` instead of ``screen_name``.
    :param sleep: seconds to pause between consecutive requests. No pause
        happens before the first request or after the last one.
    """
    # Batches hold up to 70 users (tried 100 but got this error:
    # {'code': 18, 'message': 'Too many terms specified in query.'} )
    batch_size = 70
    identifiers = list(ids_or_screen_names)
    batches = [
        identifiers[start : start + batch_size]
        for start in range(0, len(identifiers), batch_size)
    ]
    url = "https://api.twitter.com/1.1/users/lookup.json"
    for index, batch in enumerate(batches):
        if index:
            # Only pause *between* requests, so finishing the final batch
            # does not cost an extra, pointless delay.
            time.sleep(sleep)
        if use_ids:
            args = {"user_id": ",".join(map(str, batch))}
        else:
            args = {"screen_name": ",".join(batch)}
        yield session.get(url, params=args).json()
493 | 
494 | 
def fetch_status_batches(session, tweet_ids, sleep=1):
    """Yield the parsed ``statuses/lookup.json`` response for each batch of tweets.

    The input is consumed completely before the first request is made and is
    split into batches of at most 100 ids. Every request asks for
    ``tweet_mode=extended``.

    :param session: object with a ``get(url, params=...)`` method whose
        return value has ``.json()``.
    :param tweet_ids: iterable of tweet ids.
    :param sleep: seconds to pause between consecutive requests. No pause
        happens before the first request or after the last one.
    """
    batch_size = 100
    ids = list(tweet_ids)
    batches = [
        ids[start : start + batch_size] for start in range(0, len(ids), batch_size)
    ]
    url = "https://api.twitter.com/1.1/statuses/lookup.json"
    for index, batch in enumerate(batches):
        if index:
            # Only pause *between* requests, so finishing the final batch
            # does not cost an extra, pointless delay.
            time.sleep(sleep)
        args = {"id": ",".join(map(str, batch)), "tweet_mode": "extended"}
        yield session.get(url, params=args).json()
512 | 
513 | 
def resolve_identifiers(db, identifiers, attach, sql):
    """Combine explicit identifiers with the first column of a SQL query.

    :param db: object exposing a ``conn`` attribute with ``execute()``.
    :param identifiers: iterable of identifiers given directly.
    :param attach: optional iterable of database files to ATTACH before the
        query runs. Each entry is either ``alias:path`` or just ``path``, in
        which case the alias is the file name up to its first ``.``. Only
        used when ``sql`` is given.
    :param sql: optional query; the first column of every row is appended
        to the result.
    :return: list of ``identifiers`` followed by the query results.
    """
    if not sql:
        return list(identifiers)
    for filepath in attach or ():
        if ":" in filepath:
            alias, filepath = filepath.split(":", 1)
        else:
            alias = filepath.split("/")[-1].split(".")[0]
        # Bind the path as a parameter rather than pasting it into a quoted
        # SQL literal, so paths containing a single quote still attach.
        # The alias is an identifier and cannot be bound.
        db.conn.execute(
            "ATTACH DATABASE ? AS [{}]".format(alias),
            [str(pathlib.Path(filepath).resolve())],
        )
    sql_identifiers = [r[0] for r in db.conn.execute(sql).fetchall()]
    return list(identifiers) + sql_identifiers
532 | 
533 | 
def fetch_and_save_list(db, session, identifier, identifier_is_id=False):
    """Fetch one Twitter list and its members and store them in ``db``.

    The list details go into ``lists`` (with its owner saved through
    ``save_users``); members are fetched page by page and linked through
    ``list_members``.

    :param db: database object indexed by table name.
    :param session: object with a ``get(url, params=...)`` method whose
        return value has ``.json()``.
    :param identifier: a list id, or ``"screen_name/slug"`` when
        ``identifier_is_id`` is false.
    :param identifier_is_id: treat ``identifier`` as a list id.
    """
    if identifier_is_id:
        args = {"list_id": identifier}
    else:
        screen_name, slug = identifier.split("/")
        args = {"owner_screen_name": screen_name, "slug": slug}

    # First fetch the list details
    details = session.get(
        "https://api.twitter.com/1.1/lists/show.json", params=args
    ).json()
    list_id = details["id"]
    del details["id_str"]
    owner = details.pop("user")
    save_users(db, [owner])
    details["user"] = owner["id"]
    details["created_at"] = parser.parse(details["created_at"])
    db["lists"].insert(details, pk="id", foreign_keys=("user",), replace=True)

    # Now fetch the members, following next_cursor until it is falsy
    members_url = "https://api.twitter.com/1.1/lists/members.json"
    cursor = -1
    while True:
        args["count"] = 5000
        args["cursor"] = cursor
        page = session.get(members_url, params=args).json()
        members = page["users"]
        save_users(db, members)
        db["list_members"].insert_all(
            ({"list": list_id, "user": member["id"]} for member in members),
            pk=("list", "user"),
            foreign_keys=("list", "user"),
            replace=True,
        )
        cursor = page["next_cursor"]
        if not cursor:
            break
        time.sleep(1)  # Rate limit = 900 per 15 minutes
569 | 
570 | 
def cursor_paginate(session, url, args, key, page_size=200, sleep=None):
    """Execute cursor pagination, yielding ``body[key]`` for each page.

    ``args`` is copied, so the caller's mapping is not modified. Every
    response is passed to ``raise_if_error`` before its body is used.
    Paging stops once ``next_cursor`` in the body is falsy.

    :param sleep: optional number of seconds to pause between pages; there
        is no pause after the final page.
    """
    params = dict(args, page_size=page_size)
    next_cursor = -1
    while True:
        params["cursor"] = next_cursor
        response = session.get(url, params=params)
        raise_if_error(response)
        payload = response.json()
        yield payload[key]
        next_cursor = payload["next_cursor"]
        if not next_cursor:
            return
        if sleep is not None:
            time.sleep(sleep)
587 | 
588 | 
class TwitterApiError(Exception):
    """Exception carrying the headers and error body of an API response."""

    def __init__(self, headers, body):
        # Both values are kept as given so callers can inspect them
        self.body = body
        self.headers = headers

    def __repr__(self):
        # Rendered as "<body>: <headers>"
        return "{0.body}: {0.headers}".format(self)
596 | 
597 | 
def raise_if_error(r):
    """Raise ``TwitterApiError`` when the JSON body of ``r`` has an ``errors`` key.

    :param r: response object providing ``.json()`` and ``.headers``.
    :raises TwitterApiError: carrying ``r.headers`` and the ``errors`` value.
    """
    # Decode the body once and reuse it for both the check and the payload
    body = r.json()
    # Only a JSON object can carry an "errors" key; a list or string body
    # that merely contains "errors" must not be indexed by that key.
    if isinstance(body, dict) and "errors" in body:
        raise TwitterApiError(r.headers, body["errors"])
601 | 
602 | 
def stream_filter(session, track=None, follow=None, locations=None, language=None):
    """Yield tweets from the ``statuses/filter.json`` streaming endpoint.

    Filter arguments left as None are omitted. Non-string values are
    joined with commas. The stream is re-requested in an endless loop, with
    a one second pause after each response ends.

    :param session: object whose ``stream`` attribute is set to True and
        whose ``post(url, params=...)`` returns a response with
        ``iter_lines()``.
    """
    session.stream = True
    args = {"tweet_mode": "extended"}
    names = ("track", "follow", "locations", "language")
    values = (track, follow, locations, language)
    for name, value in zip(names, values):
        if value is None:
            continue
        args[name] = value if isinstance(value, str) else ",".join(map(str, value))
    while True:
        response = session.post(
            "https://stream.twitter.com/1.1/statuses/filter.json", params=args
        )
        for raw_line in response.iter_lines(chunk_size=10000):
            # Skip anything that is not a JSON object
            if not raw_line.strip().startswith(b"{"):
                continue
            message = json.loads(raw_line)
            # Only yield tweet if it has an 'id' and 'created_at'
            # - otherwise it's probably a maintenance message, see
            # https://developer.twitter.com/en/docs/tweets/filter-realtime/overview/statuses-filter
            if "id" in message and "created_at" in message:
                # 'Fix' weird tweets from streaming API
                fix_streaming_tweet(message)
                yield message
            else:
                print(message)
        time.sleep(1)
634 | 
635 | 
def fix_streaming_tweet(tweet):
    """Normalise a streaming-API tweet dict in place.

    * The contents of ``extended_tweet``, when present, are merged into the
      top level of the tweet and the key is removed.
    * ``full_text`` is filled from ``text`` when it is missing.
    * The same treatment is applied recursively to ``retweeted_status`` and
      ``quoted_status``.

    :param tweet: tweet dict; modified in place.
    :raises KeyError: if the tweet has neither ``full_text`` nor ``text``.
    """
    if "extended_tweet" in tweet:
        tweet.update(tweet.pop("extended_tweet"))
    if "full_text" not in tweet:
        tweet["full_text"] = tweet["text"]
    for nested_key in ("retweeted_status", "quoted_status"):
        nested = tweet.get(nested_key)
        # A key that is present but None/empty is treated as absent, the
        # same way save_tweets() decides whether a nested tweet exists.
        if nested:
            fix_streaming_tweet(nested)
645 | 
646 | 
def user_ids_for_screen_names(db, screen_names):
    """Return ids of rows in ``users`` whose screen name matches, ignoring case.

    :param db: object exposing a ``conn`` attribute with ``execute()``.
    :param screen_names: sized collection of screen name strings.
    :return: list of matching ``id`` values; names with no match are
        simply absent.
    """
    placeholders = ", ".join("?" for _ in range(len(screen_names)))
    query = "select id from users where lower(screen_name) in ({})".format(
        placeholders
    )
    lowered = [name.lower() for name in screen_names]
    rows = db.conn.execute(query, lowered).fetchall()
    return [row[0] for row in rows]
654 | 
655 | 
def read_archive_js(filepath):
    """Open a zip file and yield ``(filename, content)`` for its ``.js`` members.

    ``filename`` is the final path component only and ``content`` is the
    member's raw bytes. The archive is closed once iteration finishes or
    the generator is closed.

    :param filepath: path or file-like object accepted by ``zipfile.ZipFile``.
    """
    # The context manager releases the archive handle; ZipFile.read() opens
    # and closes each member itself instead of leaving it open.
    with zipfile.ZipFile(filepath) as zf:
        for zi in zf.infolist():
            # Ignore files in an assets dir -- these are for Twitter's archive
            # browser thingie -- and only use final filenames since some
            # archives appear to put data in a data/ subdir, which can screw
            # up the filename -> importer mapping.
            if zi.filename.endswith(".js") and not zi.filename.startswith("assets/"):
                yield pathlib.Path(zi.filename).name, zf.read(zi.filename)
666 | 
667 | 
def extract_and_save_source(db, source):
    """Store the parsed ``source`` value in ``sources`` and return its key.

    The named groups captured by ``source_re`` become the row, which is
    inserted with a hash-derived ``id``.

    :param db: database object indexed by table name.
    :param source: raw source string; a falsy value is not stored.
    :return: primary key of the stored row, or None for a falsy ``source``.
    """
    if source:
        details = source_re.match(source).groupdict()
        return db["sources"].insert(details, hash_id="id", replace=True).last_pk
    return None
674 | 
675 | 
def save_user_counts(db, user):
    """Append a ``count_history`` row for each of the user's counts that changed.

    For every entry in ``COUNT_HISTORY_TYPES`` the most recent stored count
    for this user is compared with ``user["<type_name>_count"]``; a new row
    is written only when they differ (or when nothing is stored yet).

    :param db: database object exposing ``conn`` and table access by name.
    :param user: user dict with ``id`` and one ``<type_name>_count`` key per
        history type.
    """
    for type_name, type_id in COUNT_HISTORY_TYPES.items():
        latest = db.conn.execute(
            """
                select count from count_history
                where type = ? and user = ?
                order by datetime desc limit 1
                """,
            [type_id, user["id"]],
        ).fetchone()
        # No stored row yet means there is no previous count to compare with
        previous_count = None if latest is None else latest[0]
        current_count = user["{}_count".format(type_name)]
        if current_count == previous_count:
            continue
        # Second-resolution UTC timestamp with an explicit offset
        timestamp = datetime.datetime.utcnow().isoformat().split(".")[0] + "+00:00"
        db["count_history"].insert(
            {
                "type": type_id,
                "user": user["id"],
                "datetime": timestamp,
                "count": current_count,
            },
            replace=True,
        )
702 | 


--------------------------------------------------------------------------------