├── Gemfile
├── LICENSE
├── README.md
├── common
├── queries.rb
├── requester.rb
└── utilities.rb
├── config
└── .config.yaml
├── scripts
├── polling.rb
└── search.rb
└── searchtweets
└── search_tweets.rb
/Gemfile:
--------------------------------------------------------------------------------
1 | # -*- encoding: utf-8 -*-
2 | source 'https://rubygems.org'
3 | gem "mysql2"
4 | gem "json"
5 | gem "activerecord-mysql2-adapter"
6 |
7 |
8 |
9 |
10 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 |
3 | # Ruby client for Twitter API v2 search endpoints
4 |
5 | Welcome to the main branch of the Ruby search client. This branch supports the [Twitter API v2 'recent' and 'all' search](https://developer.twitter.com/en/docs/twitter-api/tweets/search/introduction)
6 | only, and drops support for the premium and enterprise tiers.
7 |
8 | The 'recent' search endpoint provides Tweets from the past 7 days. The 'all' search endpoint, launched in January 2021 as part of the 'academic research' tier of Twitter API v2 access,
9 | provides access to all publicly available Tweets posted since March 2006.
10 |
11 | To learn more about the Twitter academic research program, see this [Twitter blog post](https://blog.twitter.com/developer/en_us/topics/tips/2021/enabling-the-future-of-academic-research-with-the-twitter-api.html).
12 |
13 | If you are looking for the original version that works with premium and enterprise versions of search, head on over to the "enterprise-premium" branch.
14 |
15 | If you are already familiar with the 'labs' version/branch, that version has been deprecated and it's time to start using the Twitter API v2 version.
16 |
17 | ## Features
18 | + Supports [Twitter API v2 'recent' and 'all' search](https://developer.twitter.com/en/docs/twitter-api/tweets/search/introduction).
19 | + Command-line utility is pipeable to other tools (e.g., jq).
20 | + Automatically handles pagination of search results with specifiable limits. This enables users to define a *study period* of interest, and the search client code will manage however many requests are required to transverse that period, up to 100 Tweets at a time.
21 | + By default, the script writes Tweets to standard out, and can also write to files or return either a hash or JSON string.
22 | + Flexible usage within a Ruby program.
23 | + Supports "polling" use cases.
24 | + Supports the new v2 feature of selecting the object attributes of interest with the new `tweet.fields`, `user.fields`, `media.fields`, `place.fields`, and `poll.fields` request parameters. These parameter values are configured in the client YAML configuration file.
25 | + **New**: Now that there are two search endpoints, the endpoint you want to hit is specified in the `config.yaml` configuration file, with the `endpoint` key.
26 | + **Note:** the v2 search endpoints *do not* support the ```counts``` endpoint.
27 |
28 | ----------------
29 | Jump to:
30 |
31 | + [Overview](#overview)
32 | + [Getting started](#getting-started)
33 | + [Configuring client](#config)
34 | + [Setting credentials](#credentials)
35 | + [Configuration file](#config-file)
36 | + [Command-line arguments](#arguments)
37 | + [Example script commands](#example-calls)
38 | + [Running in 'polling' mode](#polling)
39 | + [Specifying search period start and end times](#specifying-times)
40 | + [Automating multiple queries](#queries)
41 | --------------------
42 |
43 | ## Overview
44 |
45 | This project includes two Ruby scripts (```search.rb``` and ```polling.rb```, both in the /scripts folder) that are written for the v2 search endpoints. These scripts demonstrate how to create an instance of this project's main ```SearchTweets``` class (implemented in searchtweets/search_tweets.rb) and ask it for data.
46 |
47 | These scripts are command-line driven and support the following features:
48 |
49 | + Supports flexible ways to specify the search *study period*. Your study period may be a week, and the example script manages the multiple requests needed to span that period. E.g., ```-s 7d``` specifies the past 7 days. ```-s 12h``` specifies 12 hours, and ```-s 90m``` specifies 90 minutes. Other patterns such as ```YYYY-MM-DD HH:mm```, standard Twitter ISO timestamps, and the legacy 'Gnip' ```YYYYMMDDhhmm``` pattern are also supported. If no ```start-time``` and ```end-time``` details are included, the endpoint defaults to the previous seven days, starting with the most recent Tweets, then going back through time one page at a time.
50 |
51 | + Supports a "polling" ```-p``` mode. Polling mode is a pattern where a request is made on an interval (defaults to every 10 minutes). Both scripts support polling:
52 | + search.rb: This script is designed to make a set of requests and quit. When in 'polling' mode, the script leaves a 'breadcrumb' files with the 'newest' Tweet ID in it. The next time the script runs, it references this 'newest_id.txt' file and asks for Tweets posted since that one, then quits. Designed to be entered as a crontab job.
53 | + polling.rb: This script is based on an endless loop that makes requests for new Tweets on an ```--poll-interval``` (in minutes) command-line argument.
54 |
55 | + Polling also supports 'backfills.' You can initiate a polling session that starts with a backfill period to retrieve Tweets from first, then begins polling for new data. When listening for a topic of interest, it's common to start off with some recent history.
56 |
57 | + Writes to files, standard out, or receives a JSON string or hash from the underlying ```SearchTweets``` class. When writing files, one file is written for every endpoint response. File names are based on query syntax, and are serialized.
58 |
59 | + The client can stop making requests after a specified number. If your search query and period match millions of Tweets that would require hundreds (or thousands) of requests, you can have the client stop after four requests by adding the ```-x 4``` argument.
60 |
61 | + Can manage an array of queries, making requests for each. These query files can be written in YAML or JSON.
62 |
63 | + Queries can be configured with ```tag``` strings, and these are injected into the returned Tweet JSON. Tags can be used to describe why Tweets were matched. If you are building a Tweet collection based on many queries, tags are useful for logically grouping Tweets.
64 |
65 | ### SearchTweets class
66 |
67 | The ```search.rb``` and ```polling.rb``` scripts both demonstrate creating an instance of the SearchTweets class and calling its ```get_data``` method.
68 |
69 | 1) Creating an instance of the SearchTweets class.
70 | ```ruby
71 | oSearchClient = SearchTweets.new()
72 | ```
73 |
74 | 2) Calling its ```get_data``` method with a query and getting back an array of Tweets along with the ID of the most recent one returned.
75 |
76 | ```ruby
77 | tweet_array, newest_id = oSearchClient.get_data(query)
78 | ```
79 | ## Getting started
80 |
81 | Five fundamental steps need to be taken to start using this search client:
82 |
83 | 1) Establish access to the Twitter API v2 endpoints at https://developer.twitter.com/en/docs/labs/overview/whats-new
84 | 2) Obtain credentials for authenticating with the search endpoint. You'll need to create a developer App and generate an application/consumer key and secret. You can
85 | configure the scripts with either the consumer key and secret tokens or a Bearer Token that you have generated. (The v2 search endpoints uses Bearer Token authentication. If you use just the key and secret, the search client will generate the Bearer Token.) For more information, see our authentication documentation [HERE](https://developer.twitter.com/en/docs/basics/authentication/oauth-2-0).
86 | 3) Get this Ruby app running in your environment:
87 | + Clone respository.
88 | + Get gems installed with ```bundle install```. See project Gemfile. The client uses some basic gems like 'json' and 'yaml'. Test it out by running ```$ruby scripts/search.rb -h```. You should see a help menu.
89 | 4) Configure client. See below for more details.
90 | 5) Use command-line arguments to start making search requests (see examples below).
91 |
92 | **A few notes:**
93 |
94 | + Recent search supports queries up to 512 characters long.
95 | + See our [guide on creating search queries](https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-rule).
96 | + If no request start and end times are specified, the endpoint defaults to the last 7 days, starting with the most recent Tweets, and paginating backwards through time.
97 | + For more information on the search endpoint that this client exercises, see our [API Reference](https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-recent).
98 |
99 |
100 | ## Configuring client
101 |
102 | This client is configured with a combination of command-line arguments, environmental variables, and a YAML config file.
103 | This configuration file defaults to ```./config/.config.yaml```, although you can specify a different path and name with the
104 | ```--config``` command-line argument.
105 |
106 | In general, command-line arguments are used to set the most frequently changed parameters, such as the query and the start and end times.
107 | Other parameters, such as the Tweet JSON fields of interest, can be set in the YAML file. Some configuration details, such as
108 | the output mode and maximum results per response, are settable by both command-line and YAML settings. If these settings are
109 | provided via the command-line, they will overwrite any setting made in the config file.
110 |
111 | ### Setting credentials
112 |
113 | Twitter endpoint credentials can be configured as *environmental variables* or set up in the YAML file.
114 |
115 | The search client first checks for environmental variables, and if not found there, it then looks in the YAML file.
116 |
117 | #### Setting credentials with environmental variables
118 |
119 | To set up your credentials environmental variables, use the following commands. You can set up either the ```TWITTER_CONSUMER_KEY```
120 | and ```TWITTER_CONSUMER_SECRET```values or just the ```TWITTER_BEARER_TOKEN``` value.
121 |
122 | ```bash
123 | export TWITTER_CONSUMER_KEY=N0TmYC0Nsum4Rk3Y
124 | export TWITTER_CONSUMER_SECRET=N0TmYC0Nsum4Rs3cR3t
125 | ```
126 |
127 | ```bash
128 | export TWITTER_BEARER_TOKEN=AAAAAAAAreallylongBearerT0k4n
129 | ```
130 |
131 | To have these environmental variables persist between terminal sessions, add these commands to your ~/.bash_profile (at least on Linux/Unix).
132 |
133 | #### Setting credentials in YAML configuration file
134 |
135 | A ```.config.yaml``` file is used to set script options, and optionally, endpoint credentials. By default, this file is assumed to be in a ```./config``` subfolder of the main project directory. You can store it somewhere else and use the ```--config``` argument to provide the file path.
136 |
137 | In the YAML file there is a ```auth:``` section. You can either set the ```consumer_key``` and ```consumer_token``` values, or the ```bearer_token``` value.
138 |
139 | ```yaml
140 | #Credentials.
141 | auth:
142 | consumer_key: N0TmYC0Nsum4Rk3Y
143 | consumer_secret: N0TmYC0Nsum4Rs3cR3t
144 | bearer_token: AAAAAAAAreallylongBearerT0k4n
145 | ```
146 |
147 | ## Setting client options in YAML configuration file
148 |
149 | This client works with both the 'recent' and 'all' search endpoints. As seen in the example `.config.yaml` file below,
150 | the endpoint you are working with is specified with the `endpoint` key.
151 |
152 | This version of search enables developers to fine-tune the details they want to include in the endpoint's responses, using [expansions](https://developer-staging.twitter.com/en/docs/twitter-api/expansions)
153 | and [fields](https://developer-staging.twitter.com/en/docs/twitter-api/fields). Since expansions and fields details can be
154 | very lengthy, these options are set in the YAML configuration file. The example file below includes all the available options
155 | for expansions and fields. As you work with the client's output, you may decide to exclude objects and fields that you do
156 | not need.
157 |
158 | Other options configurable in the file include the maximum number of Tweets to include per 'page' of results, ```max_results```,
159 | and how the data is processed. If the ```write_mode``` is set to 'files', the ```out_box``` is set to where you want files
160 | to be written.
161 |
162 | If the ```write_mode``` is set to 'json' or 'hash', the ```max_tweets_in_returned_hash``` can be used to set a upper limit
163 | on the number of Tweets written to this one data structure. This client is designed to make as many requests as needed to
164 | retrieve every Tweet that matches your query and study period. Since that number of Tweets can be very large, this can be used
165 | to limit the amount of memory used to store the payload.
166 |
167 | ```yaml
168 | #Client options.
169 | options:
170 | endpoint: https://api.twitter.com/2/tweets/search/recent #Also: https://api.twitter.com/2/tweets/search/all
171 |
172 | #Default API request parameters.
173 | max_results: 50 #For v2 this max is 100. Default is 10.
174 |
175 | expansions: attachments.poll_ids,attachments.media_keys,author_id,entities.mentions.username,geo.place_id,in_reply_to_user_id,referenced_tweets.id,referenced_tweets.id.author_id
176 | tweet.fields: attachments,author_id,context_annotations,conversation_id,created_at,entities,geo,id,in_reply_to_user_id,lang,possibly_sensitive,promoted_metrics,public_metrics,referenced_tweets,source,text,withheld
177 | #If you are using user-context authentication, this Tweet field is available for the authorizing user: non_public_metrics.
178 | #If that user is promoting Tweets with Twitter Ads, these Tweet fields are available: organic_metrics, promoted_metrics
179 | user.fields: created_at,description,entities,id,location,name,pinned_tweet_id,profile_image_url,protected,public_metrics,url,username,verified,withheld
180 | media.fields: duration_ms,height,media_key,preview_image_url,public_metrics,type,url,width
181 | place.fields: contained_within,country,country_code,full_name,geo,id,name,place_type
182 | poll.fields: duration_minutes,end_datetime,id,options,voting_status
183 |
184 | write_mode: so # options: json, files, so/standard/standard-out, hash --> Store Tweets in local files or print to system out?
185 | out_box: ./output # Folder where retrieved data goes.
186 | max_tweets_in_returned_hash: 10000
187 |
188 | ```
189 |
190 | ## Command-line arguments
191 |
192 | The ```search.rb``` and ```polling.rb``` example scripts support the following commands.
193 |
194 | ### Command-line options for ```search.rb``` script:
195 |
196 | ```
197 | Usage: search [options]
198 | -c, --config CONFIG Configuration file (including path) that provides account and option selections.
199 | Config file specifies which search api, includes credentials, and sets app options.
200 | -q, --query QUERY Maps to API "query" parameter. Either a single query passed in, or a file containing either a
201 | YAML or JSON array of queries/rules.
202 | -s, --start-time START UTC timestamp for beginning of Search period (maps to "fromDate").
203 | Specified as YYYYMMDDHHMM, \"YYYY-MM-DD HH:MM\", YYYY-MM-DDTHH:MM:SS.000Z or use ##d, ##h or ##m.
204 | -e, --end-time END UTC timestamp for ending of Search period (maps to "toDate").
205 | Specified as YYYYMMDDHHMM, \"YYYY-MM-DD HH:MM\", YYYY-MM-DDTHH:MM:SS.000Z or use ##d, ##h or ##m.
206 | -p, --poll Sets "polling" mode.
207 | -i, --since-id SINCEID All matching Tweets since this Tweet ID was created (exclusive).
208 | -u, --until-id UNTILID All matching Tweets up until this ID was created (exclusive).
209 | -m, --max MAXRESULTS Specify the maximum amount of Tweets results per response (maps to "max_results"). 10 to 100, defaults to 10.
210 |
211 | -x, --exit EXIT Specify the maximum amount of requests to make. "Exit app after this many requests."
212 | -w, --write WRITE 'files', 'standard-out' (or 'so' or 'standard').
213 | -o, --outbox OUTBOX Optional. Triggers the generation of files and where to write them.
214 | -t, --tag TAG Optional. Gets included in the payload if included. Alternatively, rules files can contain tags.
215 | -h, --help Display this screen.
216 | ```
217 |
218 | ### Command-line options for ```polling.rb``` script:
219 |
220 | ```
221 | Usage: search [options]
222 | -c, --config CONFIG Configuration file (including path) that provides account and option selections.
223 | Config file specifies which search endpoint, includes credentials, and sets app options.
224 | -q, --query QUERY Maps to API "query" parameter. Either a single query passed in, or a file containing either a
225 | YAML or JSON array of queries.
226 | -s, --start-time START UTC timestamp for beginning of Search period (maps to "fromDate").
227 | Specified as YYYYMMDDHHMM, \"YYYY-MM-DD HH:MM\", YYYY-MM-DDTHH:MM:SS.000Z or use ##d, ##h or ##m.
228 | -e, --end-time END UTC timestamp for ending of Search period (maps to "toDate").
229 | Specified as YYYYMMDDHHMM, \"YYYY-MM-DD HH:MM\", YYYY-MM-DDTHH:MM:SS.000Z or use ##d, ##h or ##m.
230 | -p, --poll Sets "polling" mode.
231 | -i, --since-id SINCEID All matching Tweets since this Tweet ID was created (exclusive).
232 | -u, --until-id UNTILID All matching Tweets up until this ID was created (exclusive).
233 | -m, --max MAXRESULTS Specify the maximum amount of Tweets results per response (maps to "max_results"). 10 to 100, defaults to 10.
234 | -x, --exit EXIT Specify the maximum amount of requests to make. "Exit app after this many requests."
235 | -w, --write WRITE 'files', 'hash', standard-out' (or 'so' or 'standard').
236 | -o, --outbox OUTBOX Optional. Triggers the generation of files and where to write them.
237 | -t, --tag TAG Optional. Gets included in the Tweet payload if included. Also, queries files can contain tags.
238 | -v, --verbose Optional. Turns verbose messaging on.
239 | -h, --help Display this screen.
240 |
241 | ```
242 |
243 | ## Example script commands
244 |
245 | Here are some example commands to help you get started with the Ruby search client:
246 |
247 | + Request all Tweets posted by the @TwitterDev account over the past 5 days:
248 | + ```$ruby search.rb -q "from:TwitterDev" -s 5d```
249 |
250 | + Request Tweets matching the specified rule, but stop after three requests. Set the search period to May 8, 2020 in the MDT (UTC−6:00) timezone. This example rule translates to "match Tweets with keyword 'spring' that have a photo, video, or GIF attached 'natively' with Twitter app."
251 | + ```$ruby search.rb -q "spring has:media" -s "2020-05-08 06:00" -e "2020-05-09 06:00" -x 3```
252 |
253 | + Request Tweets and receive a Ruby hash with all matching Tweets:
254 | + ```$ruby search.rb --query "spring has:media" --start-time 12h --write hash```
255 |
256 | + Request Tweets and have the client write responses to a specified folder:
257 | + ```$ruby search.rb --query "spring has:media" --start-time 12h --write files --outbox "./output"```
258 |
259 | + Make Requests using a YAML configuration file with a custom name and stored somewhere other than the default location (./config):
260 | + ```$ruby ./search.rb -c "~/configs/twitter/my_config.yaml" -q "snow has:videos" -s 14d```
261 |
262 | ## Running in 'polling' mode
263 |
264 | The ```search.rb``` and ```polling.rb``` scripts both support a 'polling' mode. In this mode, the scripts are used to make "any new Tweets since I last asked?" requests on a user-specified interval. As that interval decreases, search endpoints can be used to collect Tweets in a near-real-time fashion. Polling mode depends on the ```since_id``` search request parameter. After collecting some Tweets, this parameter is set to the most recent (the largest) Tweet ID that has been received.
265 |
266 | Both example scripts implement a polling option, and in very different ways. One key difference is that the ```search.rb``` script depends on an external process to trigger the interval calls (e.g. setting up a crontab, or having a separate script that watches the clock), while the ```polling.rb``` script stays resident and manages its own interval calls.
267 |
268 | ### Polling with ```search.rb```
269 |
The ```search.rb``` script was originally built to manage requests across a *study period* of interest. Search endpoints return a relatively small amount of Tweets per response, and pagination is usually required to compile the Tweet collection of interest. The v2 search endpoints return 10 Tweets per response by default, and the client ```--max``` argument is available to adjust that up to the maximum number of 100 Tweets.
271 |
272 | The ```search.rb``` script now supports a ```--poll``` command-line argument. When this argument is included, the script knows to leave a 'breadcrumb' ```newest_id.txt``` file after it has completed its set of paginated requests. Search endpoints start with the most recent Tweets first, and paginate backwards through time. So the trick here, which the search client manages for you, is to persist the ```newest_id``` from the *first* request, regardless of how many requests were required to paginate and transverse your *study period*.
273 |
274 | When in polling mode, the ```search.rb``` script looks for this ```newest_id.txt``` file. If you are starting a new polling session, it's important to delete any existing ```newest_id.txt``` file. When starting without a ```newest_id.txt``` file, the ```search.rb``` script does its normal thing of making as many paginated requests as needed, then writes a new ```newest_id.txt``` file. When the script is run again, with the same command-line arguments as before, it finds the file and short-circuits to use the ```since_id``` request parameter in place of any ```start_time``` used for the first ```--poll``` run.
275 |
276 | As an example, the following call triggers the polling mode, and also asks for two days of backfill:
277 |
278 | ```$ruby search.rb --poll --query "(snow OR rain) colorado has:media" -s 2d```
279 |
280 | When this set of requests finishes, the ```newest_id.txt``` file is written. The next time the script runs, perhaps by a crontab entry, the above response is automatically updated to:
281 |
282 | ```$ruby search.rb --poll --query "(snow OR rain) colorado has:media" --since-id 1230653928645124097```
283 |
284 |
285 | ### Polling with ```polling.rb```
286 |
The ```polling.rb``` was written to focus on polling, and operates in a completely different way. The ```polling.rb``` script internally runs an endless `while` loop and self-manages its polling timing based on the interval duration passed in by the user (and defaults to 5 minutes). To do this, the script times how long each set of requests takes, and adjusts accordingly to stay precisely on the interval.
288 |
289 | As an example, the following call sets up a polling session on a 30-second interval. This request starts off with a 72-hour backfill, completes that backfill, then starts making a request every 30 seconds:
290 |
291 | ```$ruby polling.rb --poll-interval 0.5 --query "(snow OR rain) colorado has:media" -s 72h```
292 |
293 | The ```polling.rb``` script will continue to run until the script is stopped.
294 |
295 |
296 | ## Specifying search period start and end times
297 |
298 | By default the recent search endpoint will search from the previous 7 days, and the 'all' endpoint will return 30 days. However, most search requests will have a more specific period of interest. With the search endpoints, the start of the search period is specified with the ```start_time``` parameter, and the end with ```end_time``` request parameter.
299 |
300 | Both timestamps assume the UTC timezone. If you are making search requests based on a local timezone, you'll need to convert these timestamps to UTC. These search APIs require these timestamps to have the 'YYYY-MM-DDTHH:mm:ssZ' format (ISO 8601/RFC 3339). As that format suggests, search request periods can have a second granularity.
301 |
302 | This client uses the 'start' and 'end' aliases for ```start_time``` and ```end_time``` parameters, and supports additional timestamp formats.
303 |
304 | Start ```-s``` and end ```-e``` parameters can be specified in a variety of ways:
305 |
306 | + A combination of an integer and a character indicating "days" (#d), "hours" (#h) or "minutes" (#m). Some examples:
307 | + -s 5d --> Start five days ago.
308 | + -s 6d -e 2d --> Start 6 days ago and end 2 days ago.
309 | + -s 6h --> Start six hours ago (i.e. Tweets from the last six hours).
310 |
311 | + YYYYMMDDHHmm (UTC)
312 | + -s 202005170700
313 | + -e 202005180700
314 |
315 | + "YYYY-MM-DD HH:mm" (UTC, use double-quotes please).
316 | + -s "2020-05-17 06:00" -e "2020-05-19 06:00" --> Tweets from between 2020-05-17 and 2020-05-19 MDT.
317 |
+ "YYYY-MM-DDTHH:MM:SS.000Z" (ISO 8601 timestamps as used by Twitter, in UTC).
319 | + -s 2020-05-17T15:00:00.000Z --> Tweets posted since 2020-05-17 09:00:00 MDT .
320 |
321 |
322 | ## Automating multiple queries
323 |
324 | The Search endpoint works with a single query at a time. This client supports making requests with multiple queries, managing the data retrieval for each individual rule.
325 |
326 | Multiple queries can be specified in JSON or YAML files. Below is an example of each.
327 |
**JSON query file:**
329 |
330 | ```json
331 | {
332 | "queries" :
333 | [
334 | {
335 | "value" : "snow colorado",
336 | "tag" : "ski_biz"
337 | },
338 | {
339 | "value" : "snow utah",
340 | "tag" : "ski_biz"
341 | },
342 | {
343 | "value" : "rain washington",
344 | "tag" : "umbrellas"
345 | }
346 | ]
347 | }
348 | ```
349 |
**YAML query file:**
351 |
352 | ```yaml
353 | queries:
354 | - value : "snow colorado"
355 | tag : ski_biz
356 | - value : "snow utah"
357 | tag : ski_biz
358 | - value : "rain washington"
359 | tag : umbrellas
360 | ```
361 |
362 | For example, you can pass in a JSON query file located at ./queries/my-snow-rules.json with the following argument:
363 |
```$ruby search.rb -q "./queries/my-snow-rules.json" -s 7d -m 100```
365 |
--------------------------------------------------------------------------------
/common/queries.rb:
--------------------------------------------------------------------------------
1 | require "json"
2 | require "yaml"
3 |
4 | #=======================================================================================================================
class Queries
  # Simple container for search queries. Each query is a Hash with a :value
  # (the query/rule string) and an optional :tag used to label matched Tweets.
  attr_accessor :queries

  def initialize
    @queries = []
  end

  #Methods for maintaining the rules array

  # Appends a single query. A query value is required; the tag is optional.
  def add_query(value, tag=nil)
    query = { value: value }
    query[:tag] = tag unless tag.nil?
    @queries << query
  end

  # Removes every query whose :value matches, regardless of tag.
  # Uses delete_if rather than deleting inside #each: mutating the array
  # while iterating it skips elements (e.g. two adjacent queries with the
  # same value but different tags left one behind).
  def delete_query(value)
    @queries.delete_if { |query| query[:value] == value }
  end

  #Methods for getting the rules in the structure you want ===========================================================

  # Returns the queries wrapped in a {"queries": [...]} JSON string.
  def get_JSON
    { queries: @queries }.to_json
  end

  #Methods for reading queries from files ==============================================================================

  # Loads queries from a YAML file shaped like:
  #   queries:
  #     - value: "snow colorado"
  #       tag: ski_biz
  def load_query_yaml(file)
    queryset = YAML.load_file(file)
    queryset["queries"].each do |query|
      @queries << query
    end
  end

  # Loads queries from a JSON file shaped like:
  #   {"queries": [{"value": "...", "tag": "..."}]}
  def load_query_json(file)
    queryset = JSON.parse(File.read(file))
    queryset["queries"].each do |query|
      @queries << query
    end
  end
end
62 |
--------------------------------------------------------------------------------
/common/requester.rb:
--------------------------------------------------------------------------------
1 | # A general object that knows how to make HTTP requests.
2 | # A simple, common RESTful HTTP class put together for Twitter RESTful endpoints.
3 | # Does authentication via header, so supports BEARER TOKEN authentication.
4 |
5 | #=======================================================================================================================
6 |
class Requester
  require "net/https"
  require "uri"

  attr_accessor :url,
                :uri,
                :data,
                :headers,       #i.e. Authentication specified here.
                :bearer_token,
                :request_count, #Running total of HTTP requests made by this instance.
                :request_limit  #Optional cap; enforcement is left to the parent object.

  # All arguments are optional at construction time; anything omitted can be
  # assigned later via the accessors.
  def initialize(url=nil, bearer_token=nil, headers=nil)
    @url = url unless url.nil?
    @headers = headers unless headers.nil?
    @bearer_token = bearer_token unless bearer_token.nil?

    @request_count = 0
    @request_limit = nil #Not set by default. Parent object should make an informed decision.
  end

  # Assigning a URL also caches its parsed URI.
  def url=(value)
    @url = value
    @uri = URI.parse(@url)
  end

  #Fundamental REST API methods:

  # POSTs @data (or the data passed in) to @url. Returns the Net::HTTPResponse.
  def POST(data=nil)
    @data = data unless data.nil?
    request = Net::HTTP::Post.new(URI(@url).path)
    request.body = @data
    issue_request(request, apply_headers: true)
  end

  # PUTs @data (or the data passed in) to @url. Returns the Net::HTTPResponse.
  # (As in the original implementation, custom @headers are not applied here.)
  def PUT(data=nil)
    @data = data unless data.nil?
    request = Net::HTTP::Put.new(URI(@url).path)
    request.body = @data
    issue_request(request, apply_headers: false)
  end

  # GETs from @url. params is an optional Hash of query parameters.
  # Example: params["max_results"] = 100, params["since_id"] = "12345"
  def GET(params=nil)
    uri = URI(@url)
    uri.query = URI.encode_www_form(params) unless params.nil?
    request = Net::HTTP::Get.new(uri.request_uri)
    issue_request(request, apply_headers: true)
  end

  # DELETEs @url, with @data (or the data passed in) as the request body.
  # (As in the original implementation, custom @headers are not applied here.)
  def DELETE(data=nil)
    @data = data unless data.nil?
    request = Net::HTTP::Delete.new(URI(@url).path)
    request.body = @data
    issue_request(request, apply_headers: false)
  end

  #This method knows how to take app keys and generate a Bearer token.
  def get_bearer_token(consumer_key, consumer_secret)
    # Generates a Bearer Token using your Twitter App's consumer key and secret.
    # Calls the Twitter URL below and returns the Bearer Token.
    bearer_token_url = "https://api.twitter.com/oauth2/token"

    credentials = Base64.encode64("#{consumer_key}:#{consumer_secret}").gsub("\n", "")

    uri = URI(bearer_token_url)
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = true
    request = Net::HTTP::Post.new(uri.path)
    request.body = "grant_type=client_credentials"
    request['Authorization'] = "Basic #{credentials}"
    request['User-Agent'] = "LabsRecentSearchQuickStartRuby"

    response = http.request(request)

    body = JSON.parse(response.body)

    body['access_token']
  end

  private

  # Shared plumbing for all four HTTP verbs: sets the Bearer auth header,
  # optionally applies custom @headers, opens a TLS connection, retries once
  # after a 5-second pause on a transient error, and counts the request.
  # Fixes two defects in the original per-verb copies:
  #   * '@request_count =+ 1' assigned positive one instead of incrementing.
  #   * POST's rescue called an undefined logger(), raising NoMethodError.
  def issue_request(request, apply_headers: false)
    request['Authorization'] = "Bearer #{@bearer_token}"

    if apply_headers && !@headers.nil?
      @headers.each { |key, value| request[key] = value }
    end

    uri = URI(@url)
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = true

    begin
      response = http.request(request)
    rescue StandardError
      sleep 5
      response = http.request(request) #Try again once.
    end

    @request_count += 1

    response
  end
end #Requester class.
183 |
184 |
--------------------------------------------------------------------------------
/common/utilities.rb:
--------------------------------------------------------------------------------
1 | #Collection of general helper methods.
2 | #
3 | # So far, things like
4 | # + date object helpers.
5 | # + Support for a variety of timestamp formats.
6 | # + A few file-handling helpers:
7 | # + Writing/reading 'state' file.
8 | # + Checking for folder and creating it if it does not exist.
9 |
module Utilities
  require 'fileutils' # Needed for FileUtils.mkpath below.
  require 'time'      # Needed for Time.parse in the date helpers.

  # Confirms a directory exists, creating it (and any missing parents) if
  # necessary. Returns the directory path.
  def Utilities.checkDirectory(directory)
    FileUtils.mkpath(directory) unless File.directory?(directory)
    directory
  end

  # True when the object can be interpreted as a number, false otherwise.
  def Utilities.numeric?(object)
    true if Float(object) rescue false
  end

  # Writes data to file_path. Returns true on success, false on error.
  # Uses the block form of File.open so the handle is always closed.
  def Utilities.write_file(file_path, data)
    File.open(file_path, 'w') { |f| f.write(data) }
    true
  rescue StandardError
    puts 'Error writing file...'
    false
  end

  # Reads and returns the entire contents of file_path.
  # (File.read closes the handle; the previous File.open version leaked it.)
  def Utilities.read_id(file_path)
    File.read(file_path)
  end

  #Date helpers.

  # Formats a Time as a YYYYMMDDHHmm string.
  def Utilities.get_date_string(time)
    return time.year.to_s + sprintf('%02i', time.month) + sprintf('%02i', time.day) + sprintf('%02i', time.hour) + sprintf('%02i', time.min)
  end

  # Formats a Time as an ISO 8601 / RFC 3339 timestamp (minute granularity,
  # seconds always ":00", trailing "Z"). Formats the fields as given; it does
  # NOT convert the Time to UTC.
  def Utilities.get_date_ISO_string(time)
    return "#{time.year.to_s}-#{sprintf('%02i', time.month)}-#{sprintf('%02i', time.day)}T#{sprintf('%02i', time.hour)}:#{sprintf('%02i', time.min)}:00Z"
  end

  # Parses a timestamp string into a Time object.
  def Utilities.get_date_object(time_string)
    return Time.parse(time_string)
  end

  # Takes a variety of string inputs and returns a standard ISO timestamp string.
  # Supported inputs: relative offsets (#m, #h, #d), YYYYMMDDHHmm,
  # "YYYY-MM-DD HH:mm", and full ISO 8601 timestamps.
  def Utilities.set_date_string(input)

    now = Time.new

    #Handle minute notation.
    if input.downcase[-1] == "m"
      return get_date_ISO_string(now.utc - (60 * input[0..-2].to_f))
    end

    #Handle hour notation.
    if input.downcase[-1] == "h"
      return get_date_ISO_string(now.utc - (60 * 60 * input[0..-2].to_f))
    end

    #Handle day notation.
    if input.downcase[-1] == "d"
      date = now.utc - (24 * 60 * 60 * input[0..-2].to_f)
      #If exactly 7 days were requested, nudge forward 60 seconds (stays
      #inside the recent search endpoint's 7-day window).
      date = date + 60 if input[0..-2] == '7'
      return get_date_ISO_string(date)
    end

    #Handle premium/enterprise format, YYYYMMDDHHMM.
    if input.length == 12 and numeric?(input)
      date = Time.new(input[0..3].to_i, input[4..5].to_i, input[6..7].to_i, input[8..9].to_i, input[10..11].to_i)
      return get_date_ISO_string(date)
    end

    #Handle "YYYY-MM-DD 00:00".
    if input.length == 16
      date = Time.new(input[0..3].to_i, input[5..6].to_i, input[8..9].to_i, input[11..12].to_i, input[14..15].to_i)
      return get_date_ISO_string(date)
    end

    #Handle ISO 8601 timestamps, as in Twitter payload "2013-11-15T17:16:42.000Z"
    if input.length > 16
      return get_date_ISO_string(Time.parse(input))
    end

    return 'Error, unrecognized timestamp.'
  end
end
119 |
--------------------------------------------------------------------------------
/config/.config.yaml:
--------------------------------------------------------------------------------
# Any corresponding command-line arguments override these. These establish default configuration details.
# Queries are specified via command-line, or with a JSON or YAML queries file.

#Client options.
10 | options:
11 |
12 | endpoint: https://api.twitter.com/2/tweets/search/recent #Also: https://api.twitter.com/2/tweets/search/all
13 |
14 | #Default API request parameters.
15 | max_results: 10 #For Labs this max is 100. Default is 10.
16 |
17 | expansions: author_id,referenced_tweets.id,attachments.media_keys,geo.place_id
18 | tweet.fields: attachments,author_id,created_at,entities,geo,public_metrics,referenced_tweets,source
19 | user.fields: created_at,description,location,name,pinned_tweet_id,public_metrics,url,username,verified
20 | media.fields: media_key,public_metrics,type,url
21 | place.fields: country_code,full_name,geo,id,name,place_type
22 | poll.fields: options
23 |
24 | write_mode: so # options: json, files, so/standard/standard-out, hash --> Store Tweets in local files or print to system out?
25 | out_box: ./output # Folder where retrieved data goes.
26 | max_tweets_in_returned_hash: 10000
27 |
28 | #Credentials. Insert your Consumer Key and Secret, OR your Bearer Token, below.
29 | auth:
30 | #consumer_key: PutYourConsumerKeyHere
31 | #consumer_secret: PutYourConsumerSecretHere
32 | bearer_token: PutYourBearerTokenHere
33 |
--------------------------------------------------------------------------------
/scripts/polling.rb:
--------------------------------------------------------------------------------
1 | '''
2 |
3 | Manages Twitter search requests in a "polling" mode. In this mode, "any new Tweets since I last asked?" requests
4 | are made repeatedly on a polling frequency. As the polling interval decreases, the more this request pattern mimics
5 | real-time feeds. The default interval is 5 minutes.
6 |
7 | -------------------------------------------------------------------------------------------------------------------
8 | Example command-lines
9 |
10 | Poll every 5 minutes for original Tweets with mentions of keywords with media "attached."
$ruby ./polling.rb -c "./config/.config.yaml" -p 5 -q "(snow OR hail OR rain) -is:retweet has:media"
12 |
13 | -------------------------------------------------------------------------------------------------------------------
14 | '''
15 |
16 | require_relative "../searchtweets/search_tweets.rb"
17 | require_relative "../common/utilities.rb"
18 |
# Provides initial "gate-keeping" on what we have been provided. Enough information to proceed?
# A query (or a queries file) is required; everything else is optional or covered by endpoint defaults.
def check_query_and_set_defaults(oSearchTweets, query, start_time, since_id, max_results)

  # Bail out early if no query of any kind was supplied.
  if query.nil?
    puts "Either a single query or a queries files is required. "
    puts "No query, quitting..."
    exit
  end

  # A *.yaml or *.json path means a file of queries; anything else is treated
  # as a single query string.
  extension = query.split(".")[-1]
  case extension
  when "yaml"
    oSearchTweets.query_file = query
    oSearchTweets.queries.load_query_yaml(oSearchTweets.query_file)
  when "json"
    oSearchTweets.query_file = query
    oSearchTweets.queries.load_query_json(oSearchTweets.query_file)
  else
    oSearchTweets.queries.queries << { "value" => query }
  end

  # start_time accepts several formats: YYYYMMDDHHmm, "YYYY-MM-DD HH:MM",
  # ISO 8601, or relative offsets such as 14d / 48h / 360m. The recent search
  # endpoint defaults to NOW - 7 days when none is given.
  oSearchTweets.start_time_study = Utilities.set_date_string(start_time) unless start_time.nil?

  # Optional Tweet ID baseline for "anything new since?" requests.
  oSearchTweets.since_id = since_id unless since_id.nil?

  # Max results is optional; the recent search endpoint defaults to 10.
  oSearchTweets.max_results = max_results unless max_results.nil?
end
73 |
# Applies app-level options: optional query tag, verbosity, a request cap,
# and how/where retrieved Tweets are written.
def set_app_configuration(oSearchTweets, exit_after, write, outbox, tag, verbose)

  # Tag is completely optional; when provided it is attached to the first query.
  unless tag.nil?
    first_query = oSearchTweets.queries.queries
    first_query[0]["tag"] = tag
  end

  # Verbose messaging is off unless explicitly requested.
  oSearchTweets.verbose = true unless verbose.nil?

  # Supports ability to set a maximum number of (pagination) requests.
  oSearchTweets.exit_after = exit_after.to_i unless exit_after.nil?

  # 'write' option: "so", "standard", and "standard_out" all normalize to "standard_out".
  unless write.nil?
    oSearchTweets.write_mode = write
    if ["standard_out", "standard", "so"].include?(oSearchTweets.write_mode)
      oSearchTweets.write_mode = "standard_out"
    end
  end

  # Setting an outbox overrides the write_mode and sends data to files.
  unless outbox.nil?
    oSearchTweets.out_box = outbox
    oSearchTweets.write_mode = "files"
  end

end
109 |
# Prints a short summary of the upcoming polling session: the query, plus
# either the backfill start time or the since_id baseline (start time wins
# when both are provided).
def request_summary(query, start_time, since_id)
  puts "Searching with query: #{query}"

  unless start_time.nil?
    puts "Backfilling Tweets since #{start_time}..."
    return
  end

  puts "Retrieving Tweets since ID #{since_id}..." unless since_id.nil?
end
120 |
121 | #=======================================================================================================================
if __FILE__ == $0 #This script code is executed when running this file.

  require 'optparse'
  require 'base64'

  #Defines the UI for the user. Albeit a simple command-line interface.
  OptionParser.new do |o|

    #Passing in a config file.... Or you can set a bunch of parameters.
    o.on('-c CONFIG', '--config', 'Configuration file (including path) that provides account and option selections.
       Config file specifies which search endpoint, includes credentials, and sets app options.') { |config| $config = config}

    #Search query. This can be a single query ""this exact phrase\" OR keyword"
    o.on('-q QUERY', '--query', 'Maps to API "query" parameter. Either a single query passed in, or a file containing either a
       YAML or JSON array of queries/rules.') {|query| $query = query}

    #Period of search. Defaults to end = Now(), start = Now() - 30.days.
    o.on('-s START', '--start-time', 'UTC timestamp for beginning of Search period (maps to "fromDate").
         Specified as YYYYMMDDHHMM, \"YYYY-MM-DD HH:MM\", YYYY-MM-DDTHH:MM:SS.000Z or use ##d, ##h or ##m.') { |start_time| $start_time = start_time}
    o.on('-i SINCEID', '--since_id', 'All matching Tweets since this Tweet ID was created (exclusive).') {|since_id| $since_id = since_id}

    o.on('-p POLLINTERVAL', '--poll-interval', 'Polling interval in minutes. Default is 5 minutes.') {|interval| $interval = interval}

    o.on('-m MAXRESULTS', '--max', 'Specify the maximum amount of Tweets results per response (maps to "max_results"). 10 to 100, defaults to 10.') {|max_results| $max_results = max_results} #... as in look before you leap.

    o.on('-x EXIT', '--exit', 'Specify the maximum amount of requests to make. "Exit app after this many requests."') {|exit_after| $exit_after = exit_after}

    o.on('-w WRITE', '--write',"'files', 'standard-out' (or 'so' or 'standard').") {|write| $write = write}
    o.on('-o OUTBOX', '--outbox', 'Optional. Triggers the generation of files and where to write them.') {|outbox| $outbox = outbox}

    #Tag: Not in payload, but triggers a "matching_rules" section with query tag values.
    o.on('-t TAG', '--tag', 'Optional. Gets included in the payload if included. Alternatively, rules files can contain tags.') {|tag| $tag = tag}

    o.on('-v', '--verbose', 'Optional. Turns verbose messaging on.') {|verbose| $verbose = verbose}

    #Help screen.
    o.on( '-h', '--help', 'Display this screen.' ) do
      puts o
      exit
    end

    o.parse!
  end

  #Create a Tweet Search object.
  oSearchTweets = SearchTweets.new()

  oSearchTweets.queries.queries = Array.new # Ability to handle arrays of queries is baked in at a low level ;)

  #Provided config file, which can provide auth, URL metadata, and app options.
  if $config.nil?
    $config = File.expand_path("../config/.config.yaml", __dir__) #Set default.
  end

  #File.exist? -- File.exists? was deprecated and removed in Ruby 3.2.
  if !File.exist?($config)
    puts "Can not find configuration file. Quitting."
    exit
  end

  if $interval.nil?
    $interval = 5 #Default polling interval, in minutes.
  end

  oSearchTweets.get_system_config($config) #Anything on command-line overrides configuration setting...
  oSearchTweets.set_requester #With config details, set the HTTP stage for making requests.

  #Validate request and commands. #So, we got what we got from the config file, so process what was passed in.
  check_query_and_set_defaults(oSearchTweets, $query, $start_time, $since_id, $max_results)
  set_app_configuration(oSearchTweets, $exit_after, $write, $outbox, $tag, $verbose)

  #Wow, we made it all the way through that! Documentation must be awesome...
  request_summary($query, $start_time, $since_id)

  polling_interval = $interval.to_f * 60 #Seconds between polls.
  newest_id = 0

  #Start making requests and keep doing that until this script is stopped...
  while true

    start_request = Time.now

    #Start requesting data...
    tweet_array = []
    oSearchTweets.queries.queries.each do |query|
      puts "Getting activities for query: #{query["value"]}" if oSearchTweets.verbose
      tweet_array, newest_id = oSearchTweets.get_data(query["value"], oSearchTweets.start_time_study, oSearchTweets.end_time_study, oSearchTweets.since_id, oSearchTweets.until_id)
    end

    #Finished making requests for this polling interval. Subsequent polls are
    #driven purely by since_id, so clear the backfill start time.
    oSearchTweets.since_id = newest_id
    oSearchTweets.start_time_study = nil

    #returning dictionary or JSON string.
    if oSearchTweets.write_mode == 'hash' or oSearchTweets.write_mode == 'json'
      puts tweet_array.to_json if oSearchTweets.verbose
      puts "Received #{tweet_array.length} Tweets..." if oSearchTweets.verbose
      puts "Building a polling client that works with 'hash' or 'JSON' output? This is where you process that data..." if oSearchTweets.verbose
    end

    request_duration = Time.now - start_request
    #Never pass a negative duration to sleep -- it raises ArgumentError when a
    #request cycle takes longer than the polling interval.
    wait_seconds = [polling_interval - request_duration, 0].max
    puts "Polling again in #{'%.1f' % wait_seconds} seconds..."
    sleep(wait_seconds)
  end
end
228 |
--------------------------------------------------------------------------------
/scripts/search.rb:
--------------------------------------------------------------------------------
1 | '''
2 |
3 | Provides a command-line driven wrapper around the Twitter API v2 recent search endpoint.
4 |
5 | Originally built for premium and enterprise tiers of search, now updated to the Twitter API v2 recent search endpoint.
6 |
7 | -------------------------------------------------------------------------------------------------------------------
8 | This script/app is a command-line wrapper to search-tweets.rb, the SearchTweets class. The code here focuses on parsing
9 | command-line options, loading configuration details, and then calling get_data or get_counts methods.
10 |
11 | * Uses the optparse gem for parsing command-line options.
12 | * Currently loads all configuration details from a .config.yaml file
13 | * A next step could be to load in authentication keys via local environment vars.
14 |
15 | This app currently has no logging, and instead just "puts" statements to system out. The SearchTweets class includes a
16 | @verbose attribute that control the level of chatter.
17 |
18 | One query can be passed in via the command-line (most common method), or a file path can be provided which contains a
19 | query array in JSON or yaml.
20 | Loads up queries, and loops through them. At least one query is required.
21 | Writes to standard-out or files.
22 |
23 | -------------------------------------------------------------------------------------------------------------------
24 | Example command-lines
25 |
26 | #Pass in two files, the SearchTweets app config file and a Rules file.
# $ruby ./search.rb -c "./config/.config.yaml" -q "./queries/myQueries.yaml"
# $ruby ./search.rb -c "./config/.config.yaml" -q "./queries/myQueries.json"
29 |
30 | #Typical command-line usage.
31 | # Passing in single query and ISO formatted dates. Otherwise running with defaults.
32 | # $ruby ./search.rb -q "(snow OR weather) (colorado OR #COWX)" -s "2020-01-06T17:00:00Z" -e "2020-01-10T17:00:00Z"
33 | -------------------------------------------------------------------------------------------------------------------
34 | '''
35 |
36 | require_relative "../searchtweets/search_tweets.rb"
37 | require_relative "../common/utilities.rb"
38 |
# Provides initial "gate-keeping" on what we have been provided. Enough information to proceed?
# A query (or a queries file) is required; every other setting is optional or
# covered by recent search endpoint defaults.
def check_query_and_set_defaults(oSearchTweets, query, start_time, end_time, since_id, until_id, max_results)

  # Bail out early if no query of any kind was supplied.
  if query.nil?
    puts "Either a single query or a queries files is required. "
    puts "No query, quitting..."
    exit
  end

  # A *.yaml or *.json path means a file of queries; anything else is treated
  # as a single query string.
  extension = query.split(".")[-1]
  case extension
  when "yaml"
    oSearchTweets.query_file = query
    oSearchTweets.queries.load_query_yaml(oSearchTweets.query_file)
  when "json"
    oSearchTweets.query_file = query
    oSearchTweets.queries.load_query_json(oSearchTweets.query_file)
  else
    oSearchTweets.queries.queries << { "value" => query }
  end

  # Time arguments accept several formats: YYYYMMDDHHmm, "YYYY-MM-DD HH:MM",
  # ISO 8601, or relative offsets such as 14d / 48h / 360m. The endpoint
  # defaults start_time to NOW - 7 days and end_time to ~NOW.
  # Expected: start_time < end_time, and end_time <= NOW.
  oSearchTweets.start_time_study = Utilities.set_date_string(start_time) unless start_time.nil?
  oSearchTweets.end_time_study = Utilities.set_date_string(end_time) unless end_time.nil?

  # Optional Tweet ID boundaries; no defaults.
  oSearchTweets.since_id = since_id unless since_id.nil?
  oSearchTweets.until_id = until_id unless until_id.nil?

  # Max results is optional; the recent search endpoint defaults to 10.
  oSearchTweets.max_results = max_results unless max_results.nil?

end
105 |
#Applies app-level (non-query) options to the SearchTweets object.
#All parameters are optional; nil means "leave the current/default setting alone".
# exit_after - maximum number of (pagination) requests to make, as a string.
# write      - output mode: 'files', 'hash', 'standard_out' (or the shorthands 'so', 'standard', 'standard-out').
# outbox     - output directory; setting it forces write_mode to 'files'.
# tag        - optional tag applied to the first query.
# verbose    - any non-nil value turns verbose messaging on.
def set_app_configuration(oSearchTweets, exit_after, write, outbox, tag, verbose)

  #Tag is completely optional. Applied to the first loaded query only.
  if !tag.nil?
    queries = oSearchTweets.queries.queries
    queries[0]["tag"] = tag
  end

  #Verbose is completely optional; presence of the flag turns it on.
  if !verbose.nil?
    oSearchTweets.verbose = true
  end

  #Supports ability to set a maximum number of (pagination) requests.
  if !exit_after.nil?
    oSearchTweets.exit_after = exit_after.to_i
  end

  #Handle 'write' option, normalizing the shorthands the CLI documents.
  if !write.nil?
    oSearchTweets.write_mode = write

    if ["standard_out", "standard-out", "standard", "so"].include?(oSearchTweets.write_mode)
      oSearchTweets.write_mode = "standard_out"
    end
  end

  #Writing data to files.
  if !outbox.nil?
    oSearchTweets.out_box = outbox
    oSearchTweets.write_mode = "files" #Setting an outbox overrides the write_mode....
  end

end
141 |
#Prints a human-readable summary of the request about to be made:
#either an ID-bounded request (since_id/until_id) or a time-span request.
def request_summary(query, start_time, end_time, since_id, until_id)

  puts "Searching with query: #{query}"

  #ID-based navigation only applies when no explicit times were given.
  id_based = start_time.nil? && end_time.nil? && (!since_id.nil? || !until_id.nil?)

  if id_based
    puts "Retrieving data since Tweet ID #{since_id}..." unless since_id.nil?
    puts "Retrieving data up until Tweet ID #{until_id}..." unless until_id.nil?
  else
    #Describe the study period, substituting endpoint defaults for missing bounds.
    time_span =
      if start_time.nil? && end_time.nil?
        "last 7 days."
      elsif start_time.nil?
        "7 days ago to #{end_time}. "
      elsif end_time.nil?
        "#{start_time} to now. "
      else
        "#{start_time} to #{end_time}. "
      end

    puts "Retrieving data from #{time_span}..."
  end
end
163 |
164 | #=======================================================================================================================
if __FILE__ == $0 #This script code is executed when running this file.

  require 'optparse'
  require 'base64'

  #Defines the UI for the user. Albeit a simple command-line interface.
  OptionParser.new do |o|

    #Passing in a config file.... Or you can set a bunch of parameters.
    o.on('-c CONFIG', '--config', 'Configuration file (including path) that provides account and option selections.
       Config file specifies which search endpoint, includes credentials, and sets app options.') { |config| $config = config}

    o.on('-q QUERY', '--query', 'Maps to API "query" parameter.  Either a single query passed in, or a file containing either a
   YAML or JSON array of queries.') {|query| $query = query}


    #Period of search. Defaults to end = Now(), start = Now() - 30.days.
    o.on('-s START', '--start-time', 'UTC timestamp for beginning of Search period (maps to "fromDate").
         Specified as YYYYMMDDHHMM, \"YYYY-MM-DD HH:MM\", YYYY-MM-DDTHH:MM:SS.000Z or use ##d, ##h or ##m.') { |start_time| $start_time = start_time}
    o.on('-e END', '--end-time', 'UTC timestamp for ending of Search period (maps to "toDate").
        Specified as YYYYMMDDHHMM, \"YYYY-MM-DD HH:MM\", YYYY-MM-DDTHH:MM:SS.000Z or use ##d, ##h or ##m.') { |end_time| $end_time = end_time}

    o.on('-p', '--poll', 'Sets "polling" mode.') {|poll| $poll = poll}

    o.on('-i SINCEID', '--since-id', 'All matching Tweets since this Tweet ID was created (exclusive).') {|since_id| $since_id = since_id}
    o.on('-u UNTILID', '--until-id', 'All matching Tweets up until this ID was created (exclusive).') {|until_id| $until_id = until_id}

    o.on('-m MAXRESULTS', '--max', 'Specify the maximum amount of Tweets results per response (maps to "max_results"). 10 to 100, defaults to 10.') {|max_results| $max_results = max_results} #... as in look before you leap.

    o.on('-x EXIT', '--exit', 'Specify the maximum amount of requests to make. "Exit app after this many requests."') {|exit_after| $exit_after = exit_after}

    o.on('-w WRITE', '--write',"'files', 'hash', standard-out' (or 'so' or 'standard').") {|write| $write = write}
    o.on('-o OUTBOX', '--outbox', 'Optional. Triggers the generation of files and where to write them.') {|outbox| $outbox = outbox}

    #Tag:  Not in payload, but triggers a "matching_query" section with query tag values.
    o.on('-t TAG', '--tag', 'Optional. Gets included in the Tweet payload if included. Also, queries files can contain tags.') {|tag| $tag = tag}

    o.on('-v', '--verbose', 'Optional. Turns verbose messaging on.') {|verbose| $verbose = verbose}

    #Help screen.
    o.on( '-h', '--help', 'Display this screen.' ) do
      puts o
      exit
    end

    o.parse!
  end

  #Create a Tweet Search object.
  oSearchTweets = SearchTweets.new()

  oSearchTweets.queries.queries = Array.new # Ability to handle arrays of queries is baked in at a low level ;)

  #Provided config file, which can provide auth, URL metadata, and app options.
  if $config.nil?
    $config = File.expand_path("../config/.config.yaml", __dir__) #Set default.
  end

  #File.exist? (File.exists? was deprecated and removed in Ruby 3.2).
  if !File.exist?($config)
    puts "Can not find configuration file. Quitting."
    exit
  end

  oSearchTweets.get_system_config($config) #Anything on command-line overrides configuration setting...

  #Adding polling details... Persist the newest Tweet ID seen between runs.
  newest_id = '0'

  newest_id_file = './newest_id.txt' #'./config/polling.txt'

  oSearchTweets.set_requester #With config details, set the HTTP stage for making requests.
  if $poll
    puts "Polling request"

    if File.exist?(newest_id_file)
      newest_id = Utilities.read_id(newest_id_file).to_s
    end
  end

  if !$since_id.nil?
    puts "Polling request with since_id"
  end

  #A persisted newest ID supersedes any explicit time window for this run.
  if newest_id != '' and newest_id != '0'
    $since_id = newest_id
    $start_time = nil
    $end_time = nil
  end

  #Validate request and commands. #So, we got what we got from the config file, so process what was passed in.
  check_query_and_set_defaults(oSearchTweets, $query, $start_time, $end_time, $since_id, $until_id, $max_results) #TODO: add polling?
  set_app_configuration(oSearchTweets, $exit_after, $write, $outbox, $tag, $verbose)

  #Wow, we made it all the way through that! Documentation must be awesome...
  request_summary($query, $start_time, $end_time, $since_id, $until_id)

  #Start requesting data...
  tweet_array = []
  includes_array = []
  #NOTE(review): with multiple queries, each iteration overwrites tweet_array/includes_array;
  #only the last query's results survive the loop — confirm whether accumulation was intended.
  oSearchTweets.queries.queries.each do |query|
    puts "Getting activities for query: #{query["value"]}" if oSearchTweets.verbose
    tweet_array, includes_array, newest_id = oSearchTweets.get_data(query["value"], oSearchTweets.start_time_study, oSearchTweets.end_time_study, oSearchTweets.since_id, oSearchTweets.until_id)
  end

  #Finished making requests... persist the newest ID for the next polling cycle.
  Utilities.write_file(newest_id_file, newest_id) if $poll and newest_id.to_i > 0

  #returning hash or JSON string.
  if oSearchTweets.write_mode == 'hash' or oSearchTweets.write_mode == 'json'
    puts tweet_array.to_json if oSearchTweets.verbose
    puts "Received #{tweet_array.length} Tweets..." if oSearchTweets.verbose
    puts "Building a polling client that works with 'hash' or 'JSON' output? This is where you process that data..." if oSearchTweets.verbose
  end

  puts "Exiting..."
end
281 |
--------------------------------------------------------------------------------
/searchtweets/search_tweets.rb:
--------------------------------------------------------------------------------
1 | #A singleton that knows how to make requests to the Twitter Developer Labs Recent search endpoint.
2 | #
3 | # Example usage:
4 | # require_relative "../searchtweets/tweets-search.rb"
5 | #
6 | # oSearchClient = TweetsSearch.new()
7 | # tweet_array, newest_id = oSearchClient.get_data(query["value"])
8 |
9 |
class SearchTweets

  require 'json'
  require 'yaml' #Used for configuration files.
  require 'base64' #Needed if managing encrypted passwords.
  require 'fileutils'
  require 'time'

  #Common classes
  require_relative '../common/requester'
  require_relative '../common/queries'
  require_relative '../common/utilities.rb' #Mixin code.

  MAX_RESULTS_LIMIT = 100 #Limit on the number of Tweet IDs per API request, can be overridden.

  attr_accessor :tweets, #An array of Tweet JSON objects.
                :includes, #A hash of 'includes' object arrays.

                :verbose, #This code is chatty when true, mute with false.

                :request_count, #Tracks how many requests have been made.
                :tweet_count, #Tracks how many Tweets have been returned (Labs only).
                :request_timestamp, #Used for self-throttling of request rates.
                :first_request, #Polling mode triggers some 'special' first-run logic.

                :start_time_study, #'Study' period. Future versions will likely support periods longer than 7 days.
                :end_time_study,
                :newest_id_study,
                :oldest_id_study,

                # Search request parameters
                :queries, #An array of queries.
                :start_time,
                :end_time,

                :since_id,
                :until_id,
                :max_results,

                :expansions,

                :fields,

                :auth, #Keys or Bearer-token from YAML file.

                #Helper objects.
                :requester, #Object that knows RESTful HTTP requests.
                :url_data, #Search uses two different end-points...

                :exit_after, #Supports user option to quit after x requests.
                #:request_start_time, #May be breaking up 'study' period into separate smaller periods.
                #:request_end_time,

                #Query details. This client can load an array of queries from files (YAML or JSON)
                :query_file, #YAML (or JSON) file with queries.

                #This client can write to standard-out, files, and soon data stores...
                :write_rules,
                :write_mode, #files, standard out, hash
                :in_box,
                :out_box


  #Sets client defaults: empty Tweet/'includes' collections, default output mode,
  #and the helper objects (HTTP requester and query loader).
  def initialize()

    #Override and/or add to defaults
    #NOTE(review): @tweet_fields is never read in this class (@fields drives requests) — confirm before removing.
    @tweet_fields = "id,created_at,author_id,text" #Adding created_at and author_id to Labs v2 defaults.
    #Other objects need things added to Twitter defaults? Want to set a @expansions default?

    @tweets = []

    #'includes' arrays, keyed by expanded-object type.
    @includes = {}
    @includes['tweets'] = []
    @includes['users'] = []
    @includes['media'] = []
    @includes['places'] = []
    @includes['polls'] = []
    @includes['errors'] = []

    #Request 'fields' per object type, loaded from the config file.
    @fields = {}
    @fields['tweet'] = ''
    @fields['user'] = ''
    @fields['media'] = ''
    @fields['place'] = ''
    @fields['poll'] = ''

    @auth = {} #Hash for authentication keys, secrets, and tokens.

    #Defaults.
    @max_results = MAX_RESULTS_LIMIT
    @exit_after = nil #Can be set to 'nil' to not limit requests.
    @out_box = './outbox'
    @write_mode = 'standard_out' #Client defaults to writing output to standard out.

    #Helper objects, singletons.
    @requester = Requester.new #HTTP helper class.
    @queries = Queries.new #Can load queries from configuration files.

    @request_count = 0
    @tweet_count = 0
    @request_timestamp = Time.now - 1 #Used to self-throttle requests. Running script generates at least one request.

    @verbose = false
  end

  #Load in the configuration file details, setting many object attributes.
  #Authentication resolution order: ENV Bearer Token, ENV consumer key/secret,
  #then the config file's 'auth' section.
  def get_system_config(config_file)

    config = YAML.load_file(config_file)

    @url_data = config['options']['endpoint']

    #TODO: Update README to match these updates:

    #First, for authentication, look at ENV settings and see if these are set.
    bearer_token = ENV['TWITTER_BEARER_TOKEN']
    consumer_key = ENV['TWITTER_CONSUMER_KEY']
    consumer_secret = ENV['TWITTER_CONSUMER_SECRET']

    if bearer_token.nil?
      #No Bearer Token in ENV; can we generate one from ENV key/secret?
      if not consumer_key.nil? and not consumer_secret.nil?
        @auth[:bearer_token] = @requester.get_bearer_token(consumer_key, consumer_secret)
      end
    else
      @auth[:bearer_token] = bearer_token
    end

    #If not Bearer Token, then config_file is last chance.
    if @auth[:bearer_token].nil? or @auth[:bearer_token] == ''
      #Look in config_file
      bearer_token = config['auth']['bearer_token'] #Required.
      consumer_key = config['auth']['consumer_key']
      consumer_secret = config['auth']['consumer_secret']

      if bearer_token.nil? or bearer_token == ''
        @auth[:bearer_token] = @requester.get_bearer_token( consumer_key, consumer_secret )
      else
        @auth[:bearer_token] = bearer_token
      end

    end

    if !config['headers'].nil?
      @headers = config['headers']
    end

    #NOTE(review): these overwrite the defaults even when the config omits the keys (become nil) — confirm intended.
    @max_results = config['options']['max_results']

    @expansions = config['options']['expansions']

    #Load in object fields.
    @fields['tweet'] = config['options']['tweet.fields']
    @fields['user'] = config['options']['user.fields']
    @fields['media'] = config['options']['media.fields']
    @fields['place'] = config['options']['place.fields']
    @fields['poll'] = config['options']['poll.fields']

    #Support shorthands for different formats.
    @write_mode = config['options']['write_mode']
    @write_mode = 'standard_out' if @write_mode == 'so'

    #Handle outbox options. Fall back to the default outbox on any error.
    begin
      @out_box = Utilities.checkDirectory(config['options']['out_box'])
    rescue
      @out_box = './outbox'
    end

  end

  #Pass authentication and endpoint details on to the HTTP helper object.
  def set_requester

    @requester.bearer_token = @auth[:bearer_token] #Set the info needed for authentication.
    @requester.headers = @headers

    #Default to the "data" url.
    @requester.url = @url_data #Pass the URL to the HTTP object.
  end

  #Load queries from the configured queries file, if one was set.
  def get_search_rules
    if !@query_file.nil? #Fixed: was '.nil' (NoMethodError). #TODO: Add JSON option.
      @queries.load_query_yaml(@query_file)
    end
  end

  #Generate a filename for a 'page' of results:
  #  <query (alnum, first 10 chars)>_<first Tweet timestamp>_<last Tweet timestamp>
  def get_file_name(query, results)

    time_first = Time.parse(results.first['created_at'])
    time_last = Time.parse(results.last['created_at']) #Fixed: was results.first, yielding identical start/end times.

    start_time = time_first.year.to_s + sprintf('%02i', time_first.month) + sprintf('%02i', time_first.day) + sprintf('%02i', time_first.hour) + sprintf('%02i', time_first.min) + sprintf('%02i', time_first.sec)
    end_time = time_last.year.to_s + sprintf('%02i', time_last.month) + sprintf('%02i', time_last.day) + sprintf('%02i', time_last.hour) + sprintf('%02i', time_last.min) + sprintf('%02i', time_last.sec)

    query_str = query.gsub(/[^[:alnum:]]/, "")[0..9]
    filename = "#{query_str}_#{start_time}_#{end_time}"

    return filename
  end

  #Build the time/ID range portion of a request hash; nil parameters are omitted.
  def set_request_range(start_time = nil, end_time = nil, since_id = nil, until_id = nil)
    request = {}

    if !start_time.nil?
      request[:start_time] = start_time
    end

    if !end_time.nil?
      request[:end_time] = end_time
    end

    if not since_id.nil?
      request[:since_id] = since_id
    end

    if not until_id.nil?
      request[:until_id] = until_id
    end

    request
  end

  #Assemble the complete request-parameter hash for one search request,
  #including query, range, expansions, per-object fields, max_results and pagination token.
  def build_data_request(query, start_time = nil, end_time = nil, since_id = nil, until_id = nil, max_results = nil, expansions = nil, fields = nil, next_token = nil)

    request = set_request_range(start_time, end_time, since_id, until_id)

    request[:query] = query

    request[:expansions] = expansions

    #Handle JSON object fields.
    if fields.key?('tweet')
      request['tweet.fields'] = fields['tweet']
    end
    if fields.key?('user')
      request['user.fields'] = fields['user']
    end
    if fields.key?('media')
      request['media.fields'] = fields['media']
    end
    if fields.key?('place')
      request['place.fields'] = fields['place']
    end
    if fields.key?('poll')
      request['poll.fields'] = fields['poll']
    end

    #Per-call max_results overrides the object-level setting.
    if !max_results.nil?
      request[:max_results] = max_results
    else
      request[:max_results] = @max_results
    end

    if !next_token.nil?
      request[:next_token] = next_token
    end

    request
  end

  #Write one API response to standard out: matching Tweets, each expanded
  #'includes' object type present, then any access errors.
  def write_standard_out(api_response)

    if api_response.key?('data')
      puts "Matching Tweets:"
      results = api_response['data']
      results.each do |tweet|
        puts tweet.to_json #Standard out...
      end
    end
    if api_response.key?('includes')
      results = api_response['includes']
      if results.key?('users')
        puts "Expanded user objects:"
        users = results['users']
        users.each do |user|
          puts user.to_json
        end
      end
      if results.key?('tweets')
        puts "Expanded Tweet objects for referenced Tweets:"
        tweets = results['tweets']
        tweets.each do |tweet|
          puts tweet.to_json
        end
      end
      if results.key?('media')
        puts "Expanded media objects:"
        media_objects = results['media']
        media_objects.each do |media_object| #Renamed to avoid shadowing the collection variable.
          puts media_object.to_json
        end
      end
      if results.key?('places')
        puts "Expanded place objects:"
        places = results['places']
        places.each do |place|
          puts place.to_json
        end
      end
      if results.key?('polls')
        puts "Expanded poll objects:"
        polls = results['polls']
        polls.each do |poll|
          puts poll.to_json
        end
      end
    end
    if api_response.key?('errors')
      puts "Access errors:"
      errors = api_response['errors']
      errors.each do |error|
        puts error.to_json
      end
    end

  end

  #Append this response's 'includes' objects to the accumulating @includes arrays.
  #Arrays are not de-duplicated across pagination requests.
  def maintain_includes_arrays(api_response)

    puts "Loading 'includes' payload." if @verbose
    includes = api_response['includes']

    if not includes.nil?

      if includes.key?("users")
        puts "Adding user objects." if @verbose
        users = includes['users']
        puts "Loading 'includes' users array.." if @verbose
        users.each do |user|
          @includes['users'] << user #Pushing into a non-indexed user array that can/will have duplicates.
        end
      end

      if includes.key?("tweets")
        puts "Adding referenced Tweet objects." if @verbose
        tweets = includes['tweets']
        tweets.each do |tweet|
          @includes['tweets'] << tweet
        end
      end

      if includes.key?("media")
        puts "Adding media objects." if @verbose
        media_objects = includes['media']
        media_objects.each do |media_object| #Renamed to avoid shadowing the collection variable.
          @includes['media'] << media_object
        end
      end

      #TODO: implement adding to array.
      if includes.key?("places")
        puts "Adding place objects." if @verbose
      else
        puts "No place objects." if @verbose
      end

      #TODO: implement adding to array.
      if includes.key?("polls")
        puts "Adding poll objects." if @verbose
      else
        puts "No poll objects." if @verbose
      end
    end
  end

  #Make one request to the search endpoint, self-throttling to at most one request
  #per second, handling 503/429 back-offs, and dispatching the response to the
  #configured output mode (files, standard out, or in-memory arrays).
  #Returns the response's 'meta' object (pagination tokens, newest_id, result_count).
  def make_data_request(query, start_time, end_time, since_id, until_id, max_results, expansions, fields, next_token)

    @requester.url = @url_data

    request_data = build_data_request(query, start_time, end_time, since_id, until_id, max_results, expansions, fields, next_token)

    #Self-throttle: at most one request per second.
    if (Time.now - @request_timestamp) < 1
      sleep 1
    end
    @request_timestamp = Time.now

    #Labs supports GET only, premium/enterprise support GET and POST (preferred).
    begin
      response = @requester.GET(request_data)

    rescue
      #Single best-effort retry after transient request errors.
      puts 'Error occurred with request, retrying... '
      sleep 5
      response = @requester.GET(request_data)
    end

    if response.code.to_i > 201
      puts "#{response.code} error. #{response.message}. #{response.body}"
      error_json = JSON.parse(response.body)

      if response.code.to_i == 503

        puts "Server-side error, sleeping for 30 seconds before retrying."
        sleep 30

      elsif response.code.to_i == 429
        puts "Hit request rate limit, sleeping for 1 minute before retrying."
        sleep 60
      else
        #TODO: If we are asking about an ID too old, it would be nice to grab the suggested timestamp and Tweet ID to correct request.

        if error_json['errors'][0]['message'].include?('tweet id created after') and error_json['errors'][0]['message'].include?("'since_id' that is larger than")
          #'since_id' must be a tweet id created after [TIMESTAMP]. Please use a 'since_id' that is larger than "ID"
          created_after = ''
          id_after = 0
        end

        puts "Quitting"
        exit
      end
    end

    #Convert Search API JSON to hash.
    api_response = JSON.parse(response.body)

    if @write_mode == 'files' #write the file.

      #Each 'page' has a start and end time, go get those for generating filename.
      #TODO: just pass in first timestamp: results.first['created_at']
      filename = get_file_name(query, api_response['data'])

      puts "Storing Search API data in file: #{filename}"
      File.open("#{@out_box}/#{filename}.json", "w") do |new_file|
        new_file.write(api_response.to_json)
      end

    elsif @write_mode == 'standard_out' #Standard out
      write_standard_out(api_response)

    else #if hash, load up array

      if api_response.key?("data")
        #Maintain array!
        tweets = api_response['data']
        puts 'Loading response into @tweets array..' if @verbose
        tweets.each do |tweet|
          @tweets << tweet
        end
      else
        puts "No Tweets." if @verbose
      end

      if api_response.key?("includes")
        maintain_includes_arrays(api_response)
      end

      if api_response.key?("errors")
        errors = api_response['errors']
        puts "Errors occurred." if @verbose
        errors.each do |error|
          puts "Error occurred: " + error.to_json
        end
      end
    end

    if !api_response['meta'].nil?

      if @verbose
        puts "\nResponse metadata:"
        puts api_response['meta']
        puts "\n"
      end
    end

    return api_response['meta']

  end

  #Make initial request, and look for 'next' token, and re-request until the 'next' token is no longer provided.
  #Returns (tweets, includes, newest_id) per @write_mode:
  #  'standard_out' -> (nil, nil, newest_id); 'hash' -> arrays/hashes; 'json' -> JSON strings.
  def get_data(query, start_time, end_time, since_id, until_id)

    @request_count = 0
    @tweet_count = 0
    response_metadata = {}

    # Handle output options. Either writing to output now, or adding to @tweets array.
    if @verbose
      case
        when @write_mode == 'files'
          puts 'Writing to files.'
        when @write_mode == 'standard_out'
          puts 'Writing to standard out.'
        when @write_mode == 'hash'
          puts 'Writing to array of Tweets.'
        when @write_mode == 'json'
          puts 'Writing to array of Tweets.'
        else
          puts "Unhandled output mode?"
      end
    end

    loop do
      @request_count += 1

      puts response_metadata['next_token'] if @verbose

      if response_metadata['next_token'].nil?
        #If first response (no pagination token yet), grab the meta.newest_id.
        first_request = true
      end

      response_metadata = make_data_request(query, start_time, end_time, since_id, until_id, @max_results, @expansions, @fields, response_metadata['next_token'])

      @tweet_count += response_metadata['result_count'] if not response_metadata['result_count'].nil?

      if first_request
        puts "Persisting newest ID from first request: #{response_metadata['newest_id']}" if @verbose

        #If the first response had no Tweets (no newest_id), carry the since_id forward.
        if response_metadata.key?('newest_id')
          @newest_id_study = response_metadata['newest_id'].to_i
        else
          @newest_id_study = since_id
        end

        first_request = false #Do just once.
      end

      #Fixed for consistency with the break condition below: was @requester.request_count.
      if !@exit_after.nil? and @request_count >= @exit_after
        puts "Hit request threshold of #{@exit_after} requests. Quitting at #{Time.now}."
      end

      #If we either reach the end of the token road or have made the maximum number of requests.
      break if response_metadata['next_token'].nil? or (!@exit_after.nil? and @request_count >= @exit_after)

      puts "Response has 'meta.next_token', making another request... \n" if @verbose

    end

    @request_count > 1 ? (puts "Made #{@request_count} data requests.") : (puts "Made #{@request_count} data request.") if @verbose
    @tweet_count == 1 ? (puts "Received #{@tweet_count} Tweet.") : (puts "Received #{@tweet_count} Tweets.") if @verbose
    puts "Next polling cycle: since_id = #{@newest_id_study}" if @verbose

    #These outputs are handled once at the end:

    #With standard out, we have completed our output work, and only need to return the newest ID.
    return nil, nil, @newest_id_study if @write_mode == 'standard_out'

    #With the 'hash' and 'json' options, we accumulated Tweet and 'includes' objects across multiple pagination requests.
    # So assemble the 'includes' structure
    return @tweets, @includes, @newest_id_study if @write_mode == 'hash'
    return @tweets.to_json, @includes.to_json, @newest_id_study if @write_mode == 'json'
  end #get_data

end #SearchTweets class.
565 |
--------------------------------------------------------------------------------