├── .gitignore
├── figs
│   ├── worldwide_normalized_tweets.pdf
│   └── worldwide_normalized_tweets.png
├── tweets_hydrator
│   ├── package
│   │   └── tweets_hydrator.jar
│   ├── twitter.properties
│   ├── Readme.md
│   ├── pom.xml
│   ├── src
│   │   └── main
│   │       └── java
│   │           └── qa
│   │               └── qcri
│   │                   └── tweetsretrieval
│   │                       ├── TweetsRetrievalTool.java
│   │                       └── TwitterAPI.java
│   └── sample_tweet_ids.txt
├── LICENSE
├── parsers
│   ├── base_file_data_extractor.py
│   └── meta_file_parser.py
├── preprocessing
│   └── user_location_preprocessing.py
├── meta_data
│   └── meta_file_monthly_ids_range.tsv
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | Tweets hydrator/.DS_Store
2 | Preprocessing scripts/.DS_Store
3 | .DS_Store
--------------------------------------------------------------------------------
/figs/worldwide_normalized_tweets.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CrisisComputing/TBCOV/HEAD/figs/worldwide_normalized_tweets.pdf
--------------------------------------------------------------------------------
/figs/worldwide_normalized_tweets.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CrisisComputing/TBCOV/HEAD/figs/worldwide_normalized_tweets.png
--------------------------------------------------------------------------------
/tweets_hydrator/package/tweets_hydrator.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CrisisComputing/TBCOV/HEAD/tweets_hydrator/package/tweets_hydrator.jar
--------------------------------------------------------------------------------
/tweets_hydrator/twitter.properties:
--------------------------------------------------------------------------------
1 | consumer.key=XXXXXXXXXXXXXXXX
2 | consumer.secret=XXXXXXXXXXXXXXXX
3 | access.token=XXXXXXXXXXXXXXXX
4 | access.token.secret=XXXXXXXXXXXXXXXX
5 |
--------------------------------------------------------------------------------
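
The four credentials above use simple `key=value` lines (Java properties format). If you need to consume them from Python instead of the bundled Java tool, for instance for the illustrative hydration sketch later in this document, a minimal parse could look like the following. This is a sketch, not part of the repository.

```python
# Minimal sketch: read twitter.properties into a dict (illustrative only).
def load_properties(path):
    props = {}
    with open(path) as f:
        for line in f:
            line = line.strip()
            # skip blank lines, comments, and anything without a key=value shape
            if not line or line.startswith("#") or "=" not in line:
                continue
            key, value = line.split("=", 1)
            props[key.strip()] = value.strip()
    return props

# creds = load_properties("twitter.properties")
# print(creds["consumer.key"])
```
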
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 CrisisComputing
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/tweets_hydrator/Readme.md:
--------------------------------------------------------------------------------
1 | # Tweets Hydrator Usage Guide
2 |
3 | ## Description
4 | This Java-based program hydrates tweets using the Twitter APIs. The tool makes 180 API calls per 15 minutes, and each call downloads up to 100 tweets, i.e., up to 72,000 tweets per hour.
5 |
6 | ## How to use
7 |
8 | 1. Add tweet IDs to a text file (one per line). A sample tweet-IDs file is provided in the package.
9 | 2. Create a Twitter app (if you don't have one) to obtain the following four tokens. Once obtained, add them to the `twitter.properties` file.
10 |
11 | `consumer.key=XXXX`
12 |
13 | `consumer.secret=XXXX`
14 |
15 | `access.token=XXXX`
16 |
17 | `access.token.secret=XXXX`
18 |
19 | 3. Run the `tweets_hydrator.jar` file from the package folder as shown in the following command. The command expects two parameters: the first is the file containing tweet IDs, and the second is the path and name of the output file where the tool should store the downloaded tweets.
20 |
21 | `java -classpath tweets_hydrator.jar qa.qcri.tweetsretrieval.TweetsRetrievalTool sample_tweet_ids.txt output.txt`
22 |
23 | ## Compilation
24 | After making changes, recompile with the following command; Maven places the built jar (`TweetsRetrieval-1.2-jar-with-dependencies.jar`) under `target/`.
25 | `mvn clean compile assembly:single`
26 |
--------------------------------------------------------------------------------
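
For readers who prefer Python over the bundled Java tool, the same flow (batches of up to 100 IDs sent to the v1.1 `statuses/lookup` endpoint) can be sketched as follows. This is a minimal illustration rather than part of the repository; it assumes the third-party `requests` and `requests_oauthlib` packages and the same four credentials as `twitter.properties`.

```python
# Minimal hydration sketch (illustrative only, not part of the repo).
# Assumes: pip install requests requests_oauthlib
import json

import requests
from requests_oauthlib import OAuth1

LOOKUP_URL = "https://api.twitter.com/1.1/statuses/lookup.json"

def hydrate(ids_path, out_path, auth):
    with open(ids_path) as f:
        ids = [line.strip().replace("'", "") for line in f if line.strip()]
    with open(out_path, "w") as out:
        # statuses/lookup accepts at most 100 comma-separated IDs per call
        for i in range(0, len(ids), 100):
            batch = ids[i:i + 100]
            resp = requests.get(LOOKUP_URL, params={"id": ",".join(batch)}, auth=auth)
            resp.raise_for_status()
            for tweet in resp.json():
                out.write(json.dumps(tweet) + "\n")

# auth = OAuth1(consumer_key, consumer_secret, access_token, access_token_secret)
# hydrate("sample_tweet_ids.txt", "output.txt", auth)
```
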
/tweets_hydrator/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
3 |          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4 |          xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5 |     <modelVersion>4.0.0</modelVersion>
6 |     <groupId>qa.qcri</groupId>
7 |     <artifactId>TweetsRetrieval</artifactId>
8 |     <version>1.2</version>
9 |     <packaging>jar</packaging>
10 |     <properties>
11 |         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
12 |         <maven.compiler.source>1.8</maven.compiler.source>
13 |         <maven.compiler.target>1.8</maven.compiler.target>
14 |     </properties>
15 |     <dependencies>
16 |         <dependency>
17 |             <groupId>oauth.signpost</groupId>
18 |             <artifactId>signpost-core</artifactId>
19 |             <version>1.2.1.2</version>
20 |         </dependency>
21 |         <dependency>
22 |             <groupId>javax.json</groupId>
23 |             <artifactId>javax.json-api</artifactId>
24 |             <version>1.0</version>
25 |         </dependency>
26 |         <dependency>
27 |             <groupId>org.glassfish</groupId>
28 |             <artifactId>javax.json</artifactId>
29 |             <version>1.0.4</version>
30 |             <scope>runtime</scope>
31 |         </dependency>
32 |     </dependencies>
33 |     <build>
34 |         <plugins>
35 |             <plugin>
36 |                 <artifactId>maven-assembly-plugin</artifactId>
37 |                 <configuration>
38 |                     <archive>
39 |                         <manifest>
40 |                             <mainClass>qa.qcri.tweetsretrieval.TweetsRetrievalTool</mainClass>
41 |                         </manifest>
42 |                     </archive>
43 |                     <descriptorRefs>
44 |                         <descriptorRef>jar-with-dependencies</descriptorRef>
45 |                     </descriptorRefs>
46 |                 </configuration>
47 |             </plugin>
48 |         </plugins>
49 |     </build>
50 | </project>
--------------------------------------------------------------------------------
/tweets_hydrator/src/main/java/qa/qcri/tweetsretrieval/TweetsRetrievalTool.java:
--------------------------------------------------------------------------------
1 | package qa.qcri.tweetsretrieval;
2 | 
3 | import java.io.File;
4 | import java.io.FileInputStream;
5 | import java.io.FileWriter;
6 | import java.io.IOError;
7 | import java.io.IOException;
8 | import java.io.Writer;
9 | import java.nio.file.Files;
10 | import java.util.List;
11 | import java.util.Map;
12 | import java.util.Properties;
13 | import java.util.stream.Collectors;
14 | 
15 | import javax.json.JsonArray;
16 | 
17 | public class TweetsRetrievalTool {
18 | 
19 |     private static final String NEWLINE = System.getProperty("line.separator");
20 | 
21 |     public static void main(String[] args) throws IOException {
22 |         if (args.length < 2) {
23 |             System.err.println("This app needs two parameters: source and destination files.");
24 |             return;
25 |         }
26 | 
27 |         Properties p = new Properties();
28 |         p.load(new FileInputStream("twitter.properties"));
29 | 
30 |         TwitterAPI twitter = new TwitterAPI(p);
31 | 
32 |         // Group the input IDs into batches of 100, the statuses/lookup maximum
33 |         List<String> lines = Files.readAllLines(new File(args[0]).toPath());
34 |         int[] r = new int[]{0};
35 |         Map<Integer, List<String>> groups = lines.stream()
36 |                 .collect(Collectors.groupingBy(x -> r[0]++ / 100));
37 | 
38 |         try (Writer dest = new FileWriter(new File(args[1]))) {
39 |             groups.forEach((key, value) -> {
40 |                 // Strip the quotes used in the sample IDs file
41 |                 List<String> ids = value.stream()
42 |                         .map(s -> s.replace("'", ""))
43 |                         .collect(Collectors.toList());
44 |                 try {
45 |                     JsonArray tweets =
46 |                             (JsonArray) twitter.getStatusesLookup(String.join(",", ids));
47 |                     tweets.forEach(t -> {
48 |                         try {
49 |                             dest.write(t.toString());
50 |                             dest.write(NEWLINE);
51 |                         } catch (IOException e) {
52 |                             throw new IOError(e);
53 |                         }
54 |                     });
55 |                 } catch (Exception e) {
56 |                     throw new IOError(e);
57 |                 }
58 |             });
59 |         }
60 |     }
61 | }
--------------------------------------------------------------------------------
/parsers/base_file_data_extractor.py:
--------------------------------------------------------------------------------
1 | import sys
2 | 
3 | # Sample run:
4 | # python base_file_data_extractor.py required_monthly_files.txt test_for_meta_parsing.txt '/some/path/'
5 | 
6 | 
7 | # Pass the output of meta_file_parser.py (the required monthly files list) to this script
8 | meta_parser_output = open(sys.argv[1])
9 | 
10 | # Pass the IDs file for which you want to extract the tweets/full details
11 | ids_file = open(sys.argv[2])
12 | 
13 | # Base release files path (expects '/' at the end)
14 | base_release_file_path = sys.argv[3]
15 | 
16 | # Output file
17 | output_file = open(sys.argv[2].replace('.txt', '') + "_detailed.tsv", "w+")
18 | 
19 | 
20 | # Insert a value into a dictionary, skipping None keys
21 | def insert_in_dict(dictionary, key, value):
22 |     if key is not None:
23 |         dictionary[key] = value
24 |     return dictionary
25 | 
26 | 
27 | # Load the requested tweet IDs into a dictionary for O(1) membership checks
28 | ids_dict = {}
29 | 
30 | for line in ids_file:
31 |     insert_in_dict(ids_dict, line.strip(), True)
32 | 
33 | ids_file.close()
34 | 
35 | # Write the tab-separated header matching the base release format
36 | output_file.write("tweet_id\tdate_time\tlang\tuser_id\tretweeted_id\tquoted_id\tin_reply_to_id\tsentiment_conf\tsentiment_label\tuser_type\tgender_label\ttweet_text_named_entities\tgeo_coordinates_lat_lon\tgeo_country_code\tgeo_state\tgeo_county\tgeo_city\tplace_bounding_box\tplace_country_code\tplace_state\tplace_county\tplace_city\tuser_loc_toponyms\tuser_loc_country_code\tuser_loc_state\tuser_loc_county\tuser_loc_city\tuser_profile_description_toponyms\tuser_profile_description_country_code\tuser_profile_description_state\tuser_profile_description_county\tuser_profile_description_city\ttweet_text_toponyms\ttweet_text_country_code\ttweet_text_state\ttweet_text_county\ttweet_text_city")
37 | output_file.write('\n')
38 | print('\n')
39 | print("Parsing started... wait for the data extraction...")
40 | 
41 | # Extract the matching rows from each required base file
42 | for key in meta_parser_output:
43 | 
44 |     file_to_be_read = open(base_release_file_path + key.strip())
45 | 
46 |     for full_info_line in file_to_be_read:
47 | 
48 |         data = full_info_line.split('\t')
49 | 
50 |         if ids_dict.get(data[0]) is not None:
51 |             output_file.write(full_info_line)
52 | 
53 |     file_to_be_read.close()
54 | 
55 |     print('\n')
56 |     print(key.strip() + " file completely parsed *******")
57 | 
58 | output_file.close()
59 | meta_parser_output.close()
--------------------------------------------------------------------------------
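
A consumption note for the `_detailed.tsv` produced above: as the data descriptor in the top-level README warns, tweet IDs exceed 53 bits, so loading them as default floats silently corrupts them. A hedged sketch, assuming pandas is installed and using the example output name from the sample run:

```python
# Illustrative only: load the extracted TSV without corrupting 64-bit IDs.
import pandas as pd

df = pd.read_csv(
    "test_for_meta_parsing_detailed.tsv",  # example output name from the sample run
    sep="\t",
    dtype={"tweet_id": str, "retweeted_id": str, "quoted_id": str},  # avoid float coercion
    low_memory=False,
)
print(df["tweet_id"].head())
```
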
/parsers/meta_file_parser.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from tqdm import tqdm
3 | 
4 | # Sample run:
5 | # python meta_file_parser.py test_for_meta_parsing.txt meta_data/meta_file_monthly_ids_range.tsv
6 | 
7 | 
8 | # Pass the country/language IDs file that you want to extract
9 | opened_file = open(sys.argv[1])
10 | 
11 | # Pass the meta-file as provided in the repo
12 | meta_file_opened = open(sys.argv[2])
13 | 
14 | meta_file_dict = {}
15 | 
16 | output_file = open(str(sys.argv[1].split('.')[0]) + "_required_monthly_files.txt", "w+")
17 | 
18 | 
19 | # Insert a value into a dictionary, skipping None keys
20 | def insert_in_dict(dictionary, key, value):
21 |     if key is not None:
22 |         dictionary[key] = value
23 |     return dictionary
24 | 
25 | 
26 | # Skip the header line
27 | next(meta_file_opened)
28 | 
29 | # Load the meta-file into a dictionary: file_name -> [start_id, end_id]
30 | for meta_line in meta_file_opened:
31 |     meta_array = meta_line.strip().split('\t')
32 |     insert_in_dict(meta_file_dict, meta_array[0], [int(meta_array[1]), int(meta_array[2])])
33 | 
34 | meta_file_opened.close()
35 | 
36 | print("\n")
37 | print("File 'required_monthly_files.txt' is being generated, please wait...")
38 | print("\n")
39 | 
40 | # Which base files need to be downloaded
41 | files_to_be_downloaded = set()
42 | 
43 | # Count the input lines so tqdm can show overall progress
44 | with open(sys.argv[1], 'r') as f:
45 |     num_lines = sum(1 for line in f)
46 | 
47 | # Check each tweet ID against every monthly ID range
48 | for line in tqdm(opened_file, total=num_lines):
49 | 
50 |     tweet_id = int(line.strip())
51 | 
52 |     for key in meta_file_dict:
53 |         start_id = meta_file_dict[key][0]
54 |         end_id = meta_file_dict[key][1]
55 | 
56 |         if start_id <= tweet_id <= end_id:
57 |             files_to_be_downloaded.add(key)
58 | 
59 | # Write the required file names to the output file, sorted, one per line
60 | list_files = sorted(files_to_be_downloaded)
61 | 
62 | for name in list_files:
63 |     output_file.write(name + '\n')
64 | 
65 | opened_file.close()
66 | output_file.close()
--------------------------------------------------------------------------------
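
The loop above checks each tweet ID against every monthly range. Since the ranges are disjoint, the lookup can also be done with a binary search over the sorted start IDs; a sketch of that alternative (illustrative only, standard-library `bisect`, same meta-file format):

```python
# Illustrative O(log n) range lookup over the meta-file (not part of the repo).
import bisect

def load_ranges(meta_path):
    """Return (start_id, end_id, file_name) tuples sorted by start_id."""
    ranges = []
    with open(meta_path) as f:
        next(f)  # skip the header row
        for line in f:
            name, start, end = line.strip().split('\t')
            ranges.append((int(start), int(end), name))
    ranges.sort()
    return ranges

def find_file(ranges, starts, tweet_id):
    """Return the base file whose [start_id, end_id] range covers tweet_id, if any."""
    i = bisect.bisect_right(starts, tweet_id) - 1
    if i >= 0 and tweet_id <= ranges[i][1]:
        return ranges[i][2]
    return None

# ranges = load_ranges("meta_data/meta_file_monthly_ids_range.tsv")
# starts = [r[0] for r in ranges]
# find_file(ranges, starts, 1223395535882768385)  # -> 'february_2020_f1.tsv'
```
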
/tweets_hydrator/src/main/java/qa/qcri/tweetsretrieval/TwitterAPI.java:
--------------------------------------------------------------------------------
1 | package qa.qcri.tweetsretrieval;
2 | 
3 | import java.io.IOException;
4 | import java.io.InputStream;
5 | import java.net.HttpURLConnection;
6 | import java.net.URL;
7 | import java.util.Properties;
8 | import java.util.logging.Logger;
9 | import java.util.zip.GZIPInputStream;
10 | 
11 | import javax.json.Json;
12 | import javax.json.JsonArray;
13 | import javax.json.JsonReader;
14 | 
15 | import oauth.signpost.OAuthConsumer;
16 | import oauth.signpost.basic.DefaultOAuthConsumer;
17 | import oauth.signpost.exception.OAuthException;
18 | 
19 | public class TwitterAPI {
20 | 
21 |     private static final Logger log = Logger.getLogger(TwitterAPI.class.getName());
22 |     private static final String BASE = "https://api.twitter.com";
23 | 
24 |     private OAuthConsumer consumer;
25 | 
26 |     public TwitterAPI(Properties p) {
27 |         consumer = new DefaultOAuthConsumer(p.getProperty("consumer.key"), p.getProperty("consumer.secret"));
28 |         consumer.setTokenWithSecret(p.getProperty("access.token"), p.getProperty("access.token.secret"));
29 |     }
30 | 
31 |     protected Object call(String url) throws IOException, InterruptedException, OAuthException {
32 |         URL url2 = new URL(consumer.sign(url));
33 |         HttpURLConnection conn = (HttpURLConnection) url2.openConnection();
34 |         conn.setRequestProperty("Accept-Encoding", "gzip");
35 |         int rc = conn.getResponseCode();
36 |         InputStream stream = null;
37 |         switch (rc) {
38 |             case HttpURLConnection.HTTP_OK:
39 |                 stream = conn.getInputStream();
40 |                 break;
41 |             case HttpURLConnection.HTTP_FORBIDDEN:
42 |                 stream = conn.getErrorStream();
43 |                 break;
44 |             case 429: // https://dev.twitter.com/rest/public/rate-limiting
45 |                 // Sleep until the rate-limit window resets, then retry the call;
46 |                 // guard against a reset timestamp that is already in the past.
47 |                 long reset = 1000 * conn.getHeaderFieldLong("X-Rate-Limit-Reset", 0);
48 |                 long millis = Math.max(0, reset - System.currentTimeMillis());
49 |                 if (millis < 60000) {
50 |                     log.info(String.format("waiting for %d sec (API call rate limit exceeded)", millis / 1_000));
51 |                 } else {
52 |                     log.info(String.format("waiting for %d min (API call rate limit exceeded)", millis / 60_000));
53 |                 }
54 |                 Thread.sleep(millis + 1000);
55 |                 return call(url);
56 |             default:
57 |                 throw new IOException(conn.getResponseMessage());
58 |         }
59 |         if ("gzip".equals(conn.getHeaderField("Content-Encoding")))
60 |             stream = new GZIPInputStream(stream);
61 | 
62 |         int limit = conn.getHeaderFieldInt("X-Rate-Limit-Limit", 0);
63 |         int remaining = conn.getHeaderFieldInt("X-Rate-Limit-Remaining", 0);
64 |         long resetAt = conn.getHeaderFieldLong("X-Rate-Limit-Reset", 0);
65 |         log.info(String.format("%d/%d(%d)", limit, remaining, resetAt));
66 | 
67 |         JsonReader in = Json.createReader(stream);
68 |         return in.read();
69 |     }
70 | 
71 |     public JsonArray getStatusesLookup(String ids) throws IOException, InterruptedException, OAuthException {
72 |         String endpoint = BASE + "/1.1/statuses/lookup.json?id=" + ids;
73 |         return (JsonArray) call(endpoint);
74 |     }
75 | }
--------------------------------------------------------------------------------
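
`TwitterAPI.call` above handles HTTP 429 by sleeping until the `X-Rate-Limit-Reset` timestamp and retrying. The same pattern in Python, as a hedged sketch (header names as returned by the v1.1 API; `auth` as in the earlier hydration sketch):

```python
# Illustrative sketch of the 429 sleep-and-retry loop mirrored from TwitterAPI.java.
import time

import requests

def call_with_rate_limit(url, auth, params=None):
    while True:
        resp = requests.get(url, params=params, auth=auth)
        if resp.status_code != 429:
            resp.raise_for_status()
            return resp.json()
        # Sleep until the rate-limit window resets, plus a small safety margin
        reset = int(resp.headers.get("x-rate-limit-reset", "0"))
        wait = max(0, reset - time.time()) + 1
        print(f"rate limited; waiting {wait:.0f}s")
        time.sleep(wait)
```
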
/preprocessing/user_location_preprocessing.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | from cleantext import clean
4 | import re
5 | import emoji
6 | import unicodedata
7 | import sys
8 | 
9 | # Translation table mapping every unicode punctuation character to a space
10 | PUNCT_TRANSLATE_UNICODE = dict.fromkeys(
11 |     (i for i in range(sys.maxunicode) if unicodedata.category(chr(i)).startswith("P")),
12 |     " ",
13 | )
14 | 
15 | 
16 | def preprocess_user_location(text):
17 | 
18 |     if text is None:
19 |         return None
20 |     text = text.strip()
21 |     # remove URLs
22 |     text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', text)
23 |     text = re.sub(r'http\S+', ' ', text)
24 |     # remove usernames
25 |     text = re.sub(r'@[^\s]+', ' ', text)
26 |     # remove the # in #hashtag
27 |     text = re.sub(r'#([^\s]+)', r'\1', text)
28 | 
29 |     # remove emojis
30 |     text = emoji.get_emoji_regexp().sub(u'', text)
31 | 
32 |     text = text.replace('\n', ' ')
33 |     text = text.replace('\t', ' ')
34 |     text = text.replace('\r', ' ')
35 |     text = text.replace('"', ' ')
36 |     text = text.replace('~', ' ')
37 |     text = text.replace('|', ' ')
38 |     # protect commas from the punctuation removal below
39 |     text = text.replace(',', '<<<>>>')
40 | 
41 |     text = clean(text,
42 |         fix_unicode=True,          # fix various unicode errors
43 |         to_ascii=True,             # transliterate to closest ASCII representation
44 |         lower=False,               # do not lowercase text
45 |         no_line_breaks=True,       # fully strip line breaks as opposed to only normalizing them
46 |         no_urls=True,              # replace all URLs with a special token
47 |         no_emails=True,            # replace all email addresses with a special token
48 |         no_phone_numbers=True,     # replace all phone numbers with a special token
49 |         no_numbers=False,          # keep numbers
50 |         no_digits=False,           # keep digits
51 |         no_currency_symbols=True,  # replace all currency symbols with a special token
52 |         no_punct=True,             # fully remove punctuation
53 |         replace_with_url="",
54 |         replace_with_email="",
55 |         replace_with_phone_number="",
56 |         replace_with_currency_symbol="",
57 |     )
58 | 
59 |     # remove any remaining unicode punctuation
60 |     text = text.translate(PUNCT_TRANSLATE_UNICODE)
61 | 
62 |     # restore the protected commas
63 |     text = text.replace('<<<>>>', ',')
64 | 
65 |     # remove extra spaces
66 |     text = re.sub(' +', ' ', text)
67 | 
68 |     # discard degenerate results such as '', ' ', ', ', 'a', ' E'
69 |     if len(text.replace(',', '').replace(' ', '')) > 1:
70 |         return text
71 |     else:
72 |         return None
73 | 
74 | 
75 | # some tests
76 | print(preprocess_user_location('Doha, Qatar ####'))
77 | # Converts it to "Doha, Qatar"
78 | 
79 | print(preprocess_user_location(' USA '))
80 | # Converts it to "USA"
--------------------------------------------------------------------------------
/meta_data/meta_file_monthly_ids_range.tsv:
--------------------------------------------------------------------------------
1 | File_name Start_id End_id
2 | february_2020_f1.tsv 1223395535882768385 1231201257739649025
3 | february_2020_f2.tsv 1231201258092011523 1233487061354762240
4 | february_2020_f3.tsv 1233487061400813568 1233904783833976833
5 | march_2020_f1.tsv 1233904784010358785 1235614779965927425
6 | march_2020_f10.tsv 1244321291236245511 1245075687599464448
7 | march_2020_f11.tsv 1245075687767035905 1245138807797710849
8 | march_2020_f2.tsv 1235614780003684352 1237088322662645760
9 | march_2020_f3.tsv 1237088322696159232 1238201450951565313
10 | march_2020_f4.tsv 1238201451123531778 1239190893497901056
11 | march_2020_f5.tsv 1239190893506199552 1240091884468555776
12 | march_2020_f6.tsv 1240091884472565761 1241643047554887681
13 | march_2020_f7.tsv 1241643047559061506 1242485419352002573
14 | march_2020_f8.tsv 1242485419457097734 1243478992633491457
15 | march_2020_f9.tsv 1243478992641982467 1244321291227877382
16 | april_2020_f1.tsv 1245138807957213188 1245885018100006913
17 | april_2020_f10.tsv 1252230722540060672 1253025036719599616
18 | april_2020_f11.tsv 1253025036736385030 1253833370259333120
19 | april_2020_f12.tsv 1253833370284380161 1254690078980362241
20 | april_2020_f13.tsv 1254690079030812672 1255591483429793802
21 | april_2020_f14.tsv 1255591483450802176 1256010443749015554
22 | april_2020_f2.tsv 1245885018150277120 1246767531500670977
23 | april_2020_f3.tsv 1246767531504828416 1247560749041496065
24 | april_2020_f4.tsv 1247560749066698753 1248282920172863488
25 | april_2020_f5.tsv 1248282920185282560 1249049719042527232
26 | april_2020_f6.tsv 1249049719046901760 1249852958428278787
27 | april_2020_f7.tsv 1249852958508023809 1250639237420122113
28 | april_2020_f8.tsv 1250639237441032192 1251425458081628163
29 | april_2020_f9.tsv 1251425458094129152 1252230722485579778
30 | may_2020_f1.tsv 1256010443790811137 1257060910776102915
31 | may_2020_f10.tsv 1265592235187453959 1267121083511169024
32 | may_2020_f11.tsv 1267121083553021952 1267244467603550212
33 | may_2020_f2.tsv 1257060910792916992 1258065104886730752
34 | may_2020_f3.tsv 1258065104941322240 1259114424977195008
35 | may_2020_f4.tsv 1259114425010667520 1260172229393780736
36 | may_2020_f5.tsv 1260172229452726272 1261204076987121666
37 | may_2020_f6.tsv 1261204076991254528 1262280725505560577
38 | may_2020_f7.tsv 1262280725518258176 1263351336470380546
39 | may_2020_f8.tsv 1263351336516681728 1264481336284327937
40 | may_2020_f9.tsv 1264481336401825798 1265592235166498816
41 | june_2020_f2.tsv 1268678185518010368 1270188480736325632
42 | june_2020_f3.tsv 1270188480770068480 1271734074185527296
43 | june_2020_f4.tsv 1271734074269466624 1273249326085660675
44 | june_2020_f5.tsv 1273249326135984133 1274813759215681536
45 | june_2020_f6.tsv 1274813759488430086 1276368684643016705
46 | june_2020_f7.tsv 1276368684680753152 1278116103579893760
47 | july_2020_f1.tsv 1278116103714193408 1279672350317916162
48 | july_2020_f2.tsv 1279672350322184193 1281127821038030852
49 | july_2020_f3.tsv 1281127821105135617 1282607432242466816
50 | july_2020_f4.tsv 1282607432259252224 1283761237512163329
51 | july_2020_f5.tsv 1283761237596282880 1285121873857183745
52 | july_2020_f6.tsv 1285121873886375936 1286598075998449664
53 | july_2020_f7.tsv 1286598076128468994 1288118797161783304
54 | july_2020_f8.tsv 1288118797325357061 1289350127384059906
55 | june_2020_f1.tsv 1267244467695824897 1268678185484419073
56 | august_2020_f1.tsv 1289350127459606528 1290914738906554368
57 | august_2020_f2.tsv 1290914738944086016 1292487386212048896
58 | august_2020_f3.tsv 1292487386375622656 1294236478365741056
59 | august_2020_f4.tsv 1294236478394920960 1295945048639635457
60 | august_2020_f5.tsv 1295945048782249985 1297582288671330310
61 | august_2020_f6.tsv 1297582288792936448 1299160074728865792
62 | august_2020_f7.tsv 1299160074741374977 1300584151045885956
63 | september_2020_f1.tsv 1300584151356186629 1302269940070780928
64 | september_2020_f2.tsv 1302269940104417285 1303958352255483904
65 | september_2020_f3.tsv 1303958352289030149 1305661526779596800
66 | september_2020_f4.tsv 1305661526867611648 1310597145033158658
67 | september_2020_f5.tsv 1310597145075101697 1311455787064217603
68 | october_2020_f1.tsv 1311455787303268354 1312899927576436737
69 | october_2020_f2.tsv 1312899927580848134 1314318111148068866
70 | october_2020_f3.tsv 1314318111202586624 1315998364996120577
71 | october_2020_f4.tsv 1315998365000359936 1317864059375529984
72 | october_2020_f5.tsv 1317864059409031170 1319666221038567425
73 | october_2020_f6.tsv 1319666221055234048 1321479885110546433
74 | october_2020_f7.tsv 1321479885139968000 1322689810998480898
75 | november_2020_f1.tsv 1322689811350724611 1324795676618858501
76 | november_2020_f2.tsv 1324795676811821057 1326866251117965317
77 | november_2020_f3.tsv 1326866251222806528 1328700302992826368
78 | november_2020_f4.tsv 1328700302997024771 1330567656244609024
79 | november_2020_f5.tsv 1330567656299253762 1332444617116057600
80 | november_2020_f6.tsv 1332444617225211907 1333561446932865024
81 | december_2020_f1.tsv 1333561447054499842 1335345359833337856
82 | december_2020_f2.tsv 1335345359959298055 1337051156506435586
83 | december_2020_f3.tsv 1337051156598693898 1338884363908476929
84 | december_2020_f4.tsv 1338884363967229955 1340433130780540928
85 | december_2020_f5.tsv 1340433131057233921 1341804754494697474
86 | december_2020_f6.tsv 1341804754561691650 1343612648198438913
87 | december_2020_f7.tsv 1343612648227745805 1344795470808420352
88 | january_2021_f1.tsv 1344795470900715522 1346349629365837824
89 | january_2021_f2.tsv 1346349629437341696 1347920318686441472
90 | january_2021_f3.tsv 1347920318761992193 1349513280482631683
91 | january_2021_f4.tsv 1349513280688046081 1350963889895256064
92 | january_2021_f5.tsv 1350963890079862786 1352429045670162433
93 | january_2021_f6.tsv 1352429045716115457 1353877599127134209
94 | january_2021_f7.tsv 1353877599165022209 1355348167848566790
95 | january_2021_f8.tsv 1355348167861137410 1356029494600065025
96 | february_2021_f1.tsv 1356029494696534021 1357596728745201664
97 | february_2021_f2.tsv 1357596728778772480 1359330790413369355
98 | february_2021_f3.tsv 1359330790547591169 1361176857119350784
99 | february_2021_f4.tsv 1361176857123692545 1362859197721956353
100 | february_2021_f5.tsv 1362859197784678400 1364685866174533632
101 | february_2021_f6.tsv 1364685866191495177 1366176354841018368
102 | march_2021_f1.tsv 1366176354941730818 1367814444961722368
103 | march_2021_f2.tsv 1367814445112692745 1369646816979947523
104 | march_2021_f3.tsv 1369646817038774273 1371280166681477121
105 | march_2021_f4.tsv 1371280166853431304 1372929918892380169
106 | march_2021_f5.tsv 1372929918976233485 1374584034257342464
107 | march_2021_f6.tsv 1374584034316099587 1376276403436580864
108 | march_2021_f7.tsv 1376276403457511431 1377410378527744003
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # TBCOV: Two Billion Multilingual COVID-19 Tweets with Sentiment, Entity, Geo, and Gender Labels
2 | Welcome to the code repository for the TBCOV dataset. This repository offers several scripts to help hydrate and process the shared dataset.
3 |
4 | The TBCOV dataset comprises more than two billion tweets from 218 countries worldwide. The following map shows worldwide tweets normalized by the total population from each country per 100,000 persons.
5 |
6 | 
7 |
8 | # Data descriptor for base release files
9 | |Attribute|Type|Description|
10 | |--- |--- |--- |
11 | |tweet_id|Int64|The integer representation of the unique identifier of a tweet. This number exceeds 53 bits, so some programming languages may interpret it incorrectly or with silent defects.|
12 | |date_time|String|UTC time when the tweet was created.|
13 | |lang|String|ISO 639-1 Alpha-2 language code consisting of two characters.|
14 | |user_id|String|Represents the id of the author of the tweet.|
15 | |retweeted_id|Int64|If the tweet is a retweet, the retweeted_id represents the id of the parent tweet.|
16 | |quoted_id|Int64|If the tweet is a quoted tweet, the quoted_id represents the id of the parent tweet.|
17 | |in_reply_to_id|Int64|If the tweet is a reply to an existing tweet, the in_reply_to_id represents the id of the parent/original tweet.|
18 | |sentiment_label|Int64|Represents the sentiment label values: -1 (negative), 0 (neutral), 1 (positive).|
19 | |sentiment_conf|Float|Represents the confidence score of the sentiment classifier for a given sentiment label to a tweet.|
20 | |user_type|String|Represents the identified type of the user, such as person, organization, location, etc.|
21 | |gender_label|String|One-character code representing the identified gender of the user: F represents "female" and M represents "male".|
22 | |tweet_text_named_entities|Dictionary array|Named entities (persons, organizations, locations, etc.) extracted from the tweet text, provided as an array of dictionaries.|
23 | |geo_coordinates_lat_lon|Float|GPS coordinates in the latitude, longitude format retrieved from the user's GPS-enabled device.|
24 | |geo_country_code|String|Two-character country code learned through resolving the GPS coordinates (latitude, longitude).|
25 | |geo_state|String|The name of the state/province learned through resolving the GPS coordinates (latitude, longitude).|
26 | |geo_county|String|The name of the county learned through resolving the GPS coordinates (latitude, longitude).|
27 | |geo_city|String|The name of the city learned through resolving the GPS coordinates (latitude, longitude).|
28 | |place_bounding_box|Float|Twitter provided bounding boxes representing place tags.|
29 | |place_country_code|String|Two-character country code learned through resolving the place bounding boxes.|
30 | |place_state|String|The name of the state/province learned through resolving the place bounding boxes.|
31 | |place_county|String|The name of the county learned through resolving the place bounding boxes.|
32 | |place_city|String|The name of the city learned through resolving the place bounding boxes.|
33 | |user_loc_toponyms|Dictionary array|Toponyms recognized and extracted from the user location field, provided as an array of dictionaries.|
34 | |user_loc_country_code|String|Two-character country code learned through resolving the user location toponyms.|
35 | |user_loc_state|String|The name of the state/province learned through resolving the user location toponyms.|
36 | |user_loc_county|String|The name of the county learned through resolving the user location toponyms.|
37 | |user_loc_city|String|The name of the city learned through resolving the user location toponyms.|
38 | |user_profile_description_toponyms|Dictionary array|Toponyms recognized and extracted from the user profile description field, provided as an array of dictionaries.|
39 | |user_profile_description_country_code|String|Two-character country code learned through resolving the recognized user profile description toponyms.|
40 | |user_profile_description_state|String|The name of the state/province learned through resolving the recognized user profile description toponyms.|
41 | |user_profile_description_county|String|The name of the county learned through resolving the recognized user profile description toponyms.|
42 | |user_profile_description_city|String|The name of the city learned through resolving the recognized user profile description toponyms.|
43 | |tweet_text_toponyms|Dictionary array|Toponyms recognized and extracted from the tweet full_text field, provided as an array of dictionaries.|
44 | |tweet_text_country_code|String|Two-character country code learned through resolving the recognized tweet text toponyms.|
45 | |tweet_text_state|String|The name of the state/province learned through resolving the recognized tweet text toponyms.|
46 | |tweet_text_county|String|The name of the county learned through resolving the recognized tweet text toponyms.|
47 | |tweet_text_city|String|The name of the city learned through resolving the recognized tweet text toponyms.|
48 |
49 | # Tweets hydration
50 | The tweets hydration process fetches full tweet content from Twitter using tweet-ids. To assist users with hydrating TBCOV tweets, this code repository
51 | provides a tool written in Java that takes tweet-ids as input and retrieves full tweet content from the Twitter APIs. More details and a usage guide for the tweets hydrator are available [here](https://github.com/CrisisComputing/TBCOV/tree/main/tweets_hydrator).
52 |
53 | # Preprocessing
54 | Different types of preprocessing were applied to different attributes before using them for any analysis. The preprocessing is important for replicating our results. The code repository provides several scripts used to preprocess different fields. The preprocessing scripts are available [here](https://github.com/CrisisComputing/TBCOV/tree/main/preprocessing).
55 |
56 | # Meta-data file
57 | The meta-data file provides a convenient and fast way to retrieve tweets from the base files. It holds the start and end tweet-id of every base file, so, given a tweet-id file (e.g., for a language or a country), the provided script determines which base files to parse to retrieve the matching tweets instead of parsing all two billion tweets.
58 |
59 | The [meta_file_monthly_ids_range.tsv](https://github.com/CrisisComputing/TBCOV/blob/main/meta_data/meta_file_monthly_ids_range.tsv) file lists the range of tweet IDs (between Start_id and End_id) contained in each monthly base file, as follows:
60 | |File_name|Start_id|End_id|
61 | |--- |--- |--- |
62 | |february_2020_f1.tsv|1223395535882768385|1231201257739649025|
63 |
64 | # Parsing using meta-file
65 | In the folder [parsers](https://github.com/CrisisComputing/TBCOV/tree/main/parsers), there are two scripts needed to extract tweet details from the base release files, given a specific language or country IDs file.
66 |
67 | * [meta_file_parser.py](https://github.com/CrisisComputing/TBCOV/blob/main/parsers/meta_file_parser.py) requires two arguments as input.
68 | 1. Country/Language IDs file
69 | 1. [meta_file_monthly_ids_range.tsv](https://github.com/CrisisComputing/TBCOV/blob/main/meta_data/meta_file_monthly_ids_range.tsv)
70 |
71 | A sample command to run the script is as follows:
72 |
73 | `python meta_file_parser.py [IDs file] meta_data/meta_file_monthly_ids_range.tsv`
74 |
75 | It creates an output file named `required_monthly_files.txt`, prefixed with the name of the IDs file.
76 | The contents of this file look something like the following:
77 | ```bash
78 | february_2020_f3.tsv
79 | february_2020_f2.tsv
80 | february_2020_f1.tsv
81 | ```
82 |
83 | Download the above files from [Crisis-NLP TBCOV](https://crisisnlp.qcri.org/tbcov).
84 | The output lists the monthly base files required to extract the full data for the given IDs. Make sure to download the full monthly base zip files to get the individual files listed in `required_monthly_files.txt`.
85 |
86 | Each line indicates which monthly base file is required for download so that it can be used to extract tweet details with the help of the next script.
87 |
88 | * [base_file_data_extractor.py](https://github.com/CrisisComputing/TBCOV/blob/main/parsers/base_file_data_extractor.py) requires three arguments as input.
89 | 1. `required_monthly_files.txt`, which is the output of the previous script
90 | 1. Country/Language IDs file
91 | 1. Base release files path (expects '/' at the end, e.g., `/home/downloads/`)
92 |
93 | A sample command to run the script is as follows:
94 | `python base_file_data_extractor.py required_monthly_files.txt test_for_meta_parsing.txt '/some/path/'`
95 |
96 | The output will be a .tsv file in the same format as the monthly base files.
97 |
--------------------------------------------------------------------------------
/tweets_hydrator/sample_tweet_ids.txt:
--------------------------------------------------------------------------------
1 | '503642976422096896'
2 | '503642976976113664'
3 | '503642977588494337'
4 | '503642979429388288'
5 | '503642980881022976'
6 | '503642982189252608'
7 | '503642983666032640'
8 | '503642984429395968'
9 | '503642985167609856'
10 | '503642985431830528'
11 | '503642988522647552'
12 | '503642992541171712'
13 | '503642993677443072'
14 | '503642995879849985'
15 | '503642996584513536'
16 | '503642998664884225'
17 | '503642998832242689'
18 | '503643000913006593'
19 | '503643001185636352'
20 | '503643001722118144'
21 | '503643002875969536'
22 | '503643003508883458'
23 | '503643004759195650'
24 | '503643009817128960'
25 | '503643010127499264'
26 | '503643011990183936'
27 | '503643013852463104'
28 | '503643016653860864'
29 | '503643018906570752'
30 | '503643019653181440'
31 | '503643027076694016'
32 | '503643028591218688'
33 | '503643029559734272'
34 | '503643031367467008'
35 | '503643033016233985'
36 | '503643033280462848'
37 | '503643033905410048'
38 | '503643033909202947'
39 | '503643036996227073'
40 | '503643039764860932'
41 | '503643040171700225'
42 | '503643042926977025'
43 | '503643044642848768'
44 | '503643044827394048'
45 | '503643045305516033'
46 | '503643045867188224'
47 | '503643050128592896'
48 | '503643051378483200'
49 | '503643054327484418'
50 | '503643056567255040'
51 | '503643056629768192'
52 | '503643062715703296'
53 | '503643062913212416'
54 | '503643063450083328'
55 | '503643063529398272'
56 | '503643073335685120'
57 | '503643074514272256'
58 | '503643077836546048'
59 | '503643080005013505'
60 | '503643081066184704'
61 | '503643081871089664'
62 | '503643093589958657'
63 | '503643095204782081'
64 | '503643096567926784'
65 | '503643096601858048'
66 | '503643097293914112'
67 | '503643098891575296'
68 | '503643100204773377'
69 | '503643104709066752'
70 | '503643105338216448'
71 | '503643105485004800'
72 | '503643109620588544'
73 | '503643111206027264'
74 | '503643112363671552'
75 | '503643113793933312'
76 | '503643117308764160'
77 | '503643117648486400'
78 | '503643124787609600'
79 | '503643126141947904'
80 | '503643126683009025'
81 | '503643127132221440'
82 | '503643127253450753'
83 | '503643128931565568'
84 | '503643128994086912'
85 | '503643130059829248'
86 | '503643130923466753'
87 | '503643131624292352'
88 | '503643136636506112'
89 | '503643137663725568'
90 | '503643137835675648'
91 | '503643138674528256'
92 | '503643138968133632'
93 | '503643143729082369'
94 | '503643144148099072'
95 | '503643144991567872'
96 | '503643146115223552'
97 | '503643149374205952'
98 | '503643149776875521'
99 | '503643151970869249'
100 | '503643154797854720'
101 | '503643156358115328'
102 | '503643156668096513'
103 | '503643156941111296'
104 | '503643158580690944'
105 | '503643158748880896'
106 | '503643159088218112'
107 | '503643159239622656'
108 | '503643162515349505'
109 | '503643163857526784'
110 | '503643164834803712'
111 | '503643166453825537'
112 | '503643168324071424'
113 | '503643168941015040'
114 | '503643169402003456'
115 | '503643169524047873'
116 | '503643171327602689'
117 | '503643171352739842'
118 | '503643171775983616'
119 | '503643175466983425'
120 | '503643175550861314'
121 | '503643176272265216'
122 | '503643180412436480'
123 | '503643180508909570'
124 | '503643180756393984'
125 | '503643181620015104'
126 | '503643183675219968'
127 | '503643187034853376'
128 | '503643187798228992'
129 | '503643187802406912'
130 | '503643187903488000'
131 | '503643190054764544'
132 | '503643190675505153'
133 | '503643190864257024'
134 | '503643191061401601'
135 | '503643191531167744'
136 | '503643192734916608'
137 | '503643194832060417'
138 | '503643195897810944'
139 | '503643196111327232'
140 | '503643196661174272'
141 | '503643196945997825'
142 | '503643197294514176'
143 | '503643197919469568'
144 | '503643197969424384'
145 | '503643198116225024'
146 | '503643202125955073'
147 | '503643202403196929'
148 | '503643204156006400'
149 | '503643204865236992'
150 | '503643205280468992'
151 | '503643205879861250'
152 | '503643206219624450'
153 | '503643207486283776'
154 | '503643208169967616'
155 | '503643209151807490'
156 | '503643209671925761'
157 | '503643209894211587'
158 | '503643210288496643'
159 | '503643210330435584'
160 | '503643210892464128'
161 | '503643212108795904'
162 | '503643212112994305'
163 | '503643213979070465'
164 | '503643214180392961'
165 | '503643214834724864'
166 | '503643214943764480'
167 | '503643214998282241'
168 | '503643215950389249'
169 | '503643216084602880'
170 | '503643216424357888'
171 | '503643216952844288'
172 | '503643217514868736'
173 | '503643218278240256'
174 | '503643218714439680'
175 | '503643219885031424'
176 | '503643220949991424'
177 | '503643221218824192'
178 | '503643222598356992'
179 | '503643227074080768'
180 | '503643227858030593'
181 | '503643229460262912'
182 | '503643232266620929'
183 | '503643233834893313'
184 | '503643234498015232'
185 | '503643235265576960'
186 | '503643236217270274'
187 | '503643236267982850'
188 | '503643237022564352'
189 | '503643238574456832'
190 | '503643238712872961'
191 | '503643239782420480'
192 | '503643240000540672'
193 | '503643244337451008'
194 | '503643245361242112'
195 | '503643250842796032'
196 | '503643253875286016'
197 | '503643256409030657'
198 | '503643259571167232'
199 | '503643260590759936'
200 | '503643261315985408'
201 | '503643262541111296'
202 | '503643263018872832'
203 | '503643264348467200'
204 | '503643266764374016'
205 | '503643266907402240'
206 | '503643271743037441'
207 | '503643274196684800'
208 | '503643274926518272'
209 | '503643275266228224'
210 | '503643275643719680'
211 | '503643276335779840'
212 | '503643276499353600'
213 | '503643276654559233'
214 | '503643278441324544'
215 | '503643278697189378'
216 | '503643279083061248'
217 | '503643280400453633'
218 | '503643280475582464'
219 | '503643283294142464'
220 | '503643284174929920'
221 | '503643285572055042'
222 | '503643287706931201'
223 | '503643287954001920'
224 | '503643288612913153'
225 | '503643289317552128'
226 | '503643290307403776'
227 | '503643291150458880'
228 | '503643291167248384'
229 | '503643291758260224'
230 | '503643291859292161'
231 | '503643292505210880'
232 | '503643293176332289'
233 | '503643293406609409'
234 | '503643293880963072'
235 | '503643294573006849'
236 | '503643295218958338'
237 | '503643296577880064'
238 | '503643297685200896'
239 | '503643298310148097'
240 | '503643298657861632'
241 | '503643298918334465'
242 | '503643299329359872'
243 | '503643299996241920'
244 | '503643300877058049'
245 | '503643301535551490'
246 | '503643302130753536'
247 | '503643302147915777'
248 | '503643302915489795'
249 | '503643303490109440'
250 | '503643304039575552'
251 | '503643304790360065'
252 | '503643305352372224'
253 | '503643306291896320'
254 | '503643306971398144'
255 | '503643307071639552'
256 | '503643307684421632'
257 | '503643308460347392'
258 | '503643309190156288'
259 | '503643309622181889'
260 | '503643309903183873'
261 | '503643310557503488'
262 | '503643311337639936'
263 | '503643312063258625'
264 | '503643314424655872'
265 | '503643315548323840'
266 | '503643317330903040'
267 | '503643317704212480'
268 | '503643317855207424'
269 | '503643318560239616'
270 | '503643318937739265'
271 | '503643319713673217'
272 | '503643320405725184'
273 | '503643321248395264'
274 | '503643321412378624'
275 | '503643321823420416'
276 | '503643323173969920'
277 | '503643323236904961'
278 | '503643323844685825'
279 | '503643324017020928'
280 | '503643325715730432'
281 | '503643325761851392'
282 | '503643326789062657'
283 | '503643326990778368'
284 | '503643327632519168'
285 | '503643328320385025'
286 | '503643329922215937'
287 | '503643330543357952'
288 | '503643330551775232'
289 | '503643331080257537'
290 | '503643331281555456'
291 | '503643331696803840'
292 | '503643332015554560'
293 | '503643332229480448'
294 | '503643333470982144'
295 | '503643334280482816'
296 | '503643334938988544'
297 | '503643335857569792'
298 | '503643337199718400'
299 | '503643338332180482'
300 | '503643339187830784'
301 | '503643339392966657'
302 | '503643340228014081'
303 | '503643341205299200'
304 | '503643341880565760'
305 | '503643341905752064'
306 | '503643342195138563'
307 | '503643342585233408'
308 | '503643343151448065'
309 | '503643343197577216'
310 | '503643344644239360'
311 | '503643344653012992'
312 | '503643344791031808'
313 | '503643344946601984'
314 | '503643345496047616'
315 | '503643346938523649'
316 | '503643347807129602'
317 | '503643349119549440'
318 | '503643349224804353'
319 | '503643349954613248'
320 | '503643350701191168'
321 | '503643352106299392'
322 | '503643353146064896'
323 | '503643353863294976'
324 | '503643354677407744'
325 | '503643355310743552'
326 | '503643355990220800'
327 | '503643356715835392'
328 | '503643357495963648'
329 | '503643357688500225'
330 | '503643358284509184'
331 | '503643358934605824'
332 | '503643359668604928'
333 | '503643359794434049'
334 | '503643360381652992'
335 | '503643360469725184'
336 | '503643360557412352'
337 | '503643361254047744'
338 | '503643362030026754'
339 | '503643362075742208'
340 | '503643362658750464'
341 | '503643363632226304'
342 | '503643364265193472'
343 | '503643364286550017'
344 | '503643364827598848'
345 | '503643365217693697'
346 | '503643365771341825'
347 | '503643365850628097'
348 | '503643366446608384'
349 | '503643366580424704'
350 | '503643368237580288'
351 | '503643448092942337'
352 | '503643449279938561'
353 | '503643449988751361'
354 | '503643450034491392'
355 | '503643450680803328'
356 | '503643451465162752'
357 | '503643451490324480'
358 | '503643452140453889'
359 | '503643452324581376'
360 | '503643452748615681'
361 | '503643453419311104'
362 | '503643453633613825'
363 | '503643455734956032'
364 | '503643456133419008'
365 | '503643456489943040'
366 | '503643456900972544'
367 | '503643457186168832'
368 | '503643458033438720'
369 | '503643458708717568'
370 | '503643459379789825'
371 | '503643460101230594'
372 | '503643460726173696'
373 | '503643460797485056'
374 | '503643461405663232'
375 | '503643461602775040'
376 | '503643461690466304'
377 | '503643462370328576'
378 | '503643463125311488'
379 | '503643464047681536'
380 | '503643464899518464'
381 | '503643465667076096'
382 | '503643465729581056'
383 | '503643465792507904'
384 | '503643465826058240'
385 | '503643466073915393'
386 | '503643466531102720'
387 | '503643467134664704'
388 | '503643467160248320'
389 | '503643467869085696'
390 | '503643468510822400'
391 | '503643469228019712'
392 | '503643469982605312'
393 | '503643470628937728'
394 | '503643471262285824'
395 | '503643471304216577'
396 | '503643472080171008'
397 | '503643472445071361'
398 | '503643472654393344'
399 | '503643473178673152'
400 | '503643473434906624'
401 | '503643474122788864'
402 | '503643474768703489'
403 | '503643475544645632'
404 | '503643476207337472'
405 | '503643476899405824'
406 | '503643477683752961'
407 | '503643478166102017'
408 | '503643479025909761'
409 | '503643479734755329'
410 | '503643480372314112'
411 | '503643481244725248'
412 | '503643481328611328'
413 | '503643482029047808'
414 | '503643482456457216'
415 | '503643482733682688'
416 | '503643483442532353'
417 | '503643484646277121'
418 | '503643485035978752'
419 | '503643485418037249'
420 | '503643485493559299'
421 | '503643486076538881'
422 | '503643486516559872'
423 | '503643486835712000'
424 | '503643486923403264'
425 | '503643486956957697'
426 | '503643487531978752'
427 | '503643488219852800'
428 | '503643489037737984'
429 | '503643489742385152'
430 | '503643489956290560'
431 | '503643490316607488'
432 | '503643490967109632'
433 | '503643491143282688'
434 | '503643491852111872'
435 | '503643493286567936'
436 | '503643493961838592'
437 | '503643494725197824'
438 | '503643494846836736'
439 | '503643495815720960'
440 | '503643497426341888'
441 | '503643497426350081'
442 | '503643497703153664'
443 | '503643499137609728'
444 | '503643499808718850'
445 | '503643500576272384'
446 | '503643501255720961'
447 | '503643501943615488'
448 | '503643502606311426'
449 | '503643502723735553'
450 | '503643503726178305'
451 | '503643504493731840'
452 | '503643505026011136'
453 | '503643505336782848'
454 | '503643506100150274'
455 | '503643506141708288'
456 | '503643507215839235'
457 | '503643507865968641'
458 | '503643508537049091'
459 | '503643508549623808'
460 | '503643509052932097'
461 | '503643509292023808'
462 | '503643509929558016'
463 | '503643510911021056'
464 | '503643511355617280'
465 | '503643512160923649'
466 | '503643512467103744'
467 | '503643512932667393'
468 | '503643514321002498'
469 | '503643514505527297'
470 | '503643515738673152'
471 | '503643516174868480'
472 | '503643516883709952'
473 | '503643517512851456'
474 | '503643517907111936'
475 | '503643518276206593'
476 | '503643518922145792'
477 | '503643519719071746'
478 | '503643521140928512'
479 | '503643521807831041'
480 | '503643522986045440'
481 | '503643523296821248'
482 | '503643524487577600'
483 | '503643524840308737'
484 | '503643525712736256'
485 | '503643526375407616'
486 | '503643527101034497'
487 | '503643527746949120'
488 | '503643528447016960'
489 | '503643528455790592'
490 | '503643528732221440'
491 | '503643529324036097'
492 | '503643529382748163'
493 | '503643529621823488'
494 | '503643529697320960'
495 | '503643532541046784'
496 | '503643533647949825'
497 | '503643535044653056'
498 | '503643535057227776'
499 | '503643535112163328'
500 | '503643536106196992'
--------------------------------------------------------------------------------