├── README.md ├── gdelt_miner.sh ├── gdelt.sql └── gdelt_transfer.py /README.md: -------------------------------------------------------------------------------- 1 | # gdelt_mining 2 | My own little scripts to consume and analyze the GDELT project's data 3 | 4 | ##To run scripts 5 | * Go through scripts and change path, host, username, password, etc. values 6 | * There are a few required python packages, please refer to imports for complete list 7 | 8 | ##GDELT Documentation 9 | * Gdelt 2.0: http://blog.gdeltproject.org/gdelt-2-0-our-global-world-in-realtime/ 10 | * Get latest gdelt files every 15 minutes at http://data.gdeltproject.org/gdeltv2/lastupdate.txt 11 | 12 | ##Scripts to consume event data 13 | * gdelt.sql to create table in mysql 14 | * gdelt_miner.sh to pull csv file every 15 minutes (you should set up a cron job) 15 | * gdelt_transfer.py to transfer the filtered events to mysql table (again, cron job) -------------------------------------------------------------------------------- /gdelt_miner.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #First remove the event files 4 | sudo rm /PATH/TO/latest_gdelt_events.csv 5 | sudo rm /PATH/TO/latest_gdelt_events.csv.zip 6 | 7 | 8 | #Get url towards latest GDELT update 9 | content_regex="export.CSV.zip" 10 | content=$(curl -v --silent http://data.gdeltproject.org/gdeltv2/lastupdate.txt --stderr - | grep $content_regex) 11 | 12 | IFS=' ' read -a content_components <<< "$content" 13 | latest_gdelt_url="${content_components[2]}" 14 | 15 | 16 | #Get name of compressed file 17 | IFS='/' read -a url_components <<< "$latest_gdelt_url" 18 | compressed_file_name="${url_components[4]}" 19 | 20 | 21 | #Get name of csv file 22 | IFS='.' read -a file_components <<< "$compressed_file_name" 23 | csv_file_name="${file_components[0]}.${file_components[1]}.${file_components[2]}" 24 | 25 | 26 | #Download and extract latest events 27 | curl $latest_gdelt_url > /PATH/TO/latest_gdelt_events.csv.zip 28 | unzip -p "/PATH/TO/latest_gdelt_events.csv.zip" $csv_file_name > /PATH/TO/latest_gdelt_events.csv -------------------------------------------------------------------------------- /gdelt.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE `gdelt` ( 2 | `id` int(11) unsigned NOT NULL AUTO_INCREMENT, 3 | `gdelt_id` int(11) DEFAULT NULL, 4 | `published_date` datetime NOT NULL, 5 | `insert_date` datetime NOT NULL, 6 | `url` varchar(200) NOT NULL DEFAULT '', 7 | `actor1_code` varchar(20) DEFAULT NULL, 8 | `actor1_name` varchar(50) DEFAULT NULL, 9 | `actor1_country_code` varchar(3) DEFAULT NULL, 10 | `actor1_known_group_code` varchar(20) DEFAULT NULL, 11 | `actor1_ethnic_code` varchar(20) DEFAULT NULL, 12 | `actor1_religion1_code` varchar(20) DEFAULT NULL, 13 | `actor1_religion2_code` varchar(20) DEFAULT NULL, 14 | `actor1_type1_code` varchar(3) DEFAULT NULL, 15 | `actor1_type2_code` varchar(3) DEFAULT NULL, 16 | `actor1_type3_code` varchar(3) DEFAULT NULL, 17 | `actor2_code` varchar(20) DEFAULT NULL, 18 | `actor2_name` varchar(50) DEFAULT NULL, 19 | `actor2_country_code` varchar(3) DEFAULT NULL, 20 | `actor2_known_group_code` varchar(20) DEFAULT NULL, 21 | `actor2_ethnic_code` varchar(20) DEFAULT NULL, 22 | `actor2_religion1_code` varchar(20) DEFAULT NULL, 23 | `actor2_religion2_code` varchar(20) DEFAULT NULL, 24 | `actor2_type1_code` varchar(3) DEFAULT NULL, 25 | `actor2_type2_code` varchar(3) DEFAULT NULL, 26 | `actor2_type3_code` varchar(3) DEFAULT NULL, 27 | `is_root_event` tinyint(1) DEFAULT NULL, 28 | `event_code` varchar(20) DEFAULT NULL, 29 | `event_base_code` varchar(20) DEFAULT NULL, 30 | `event_root_code` varchar(20) DEFAULT NULL, 31 | `quad_class` smallint(6) DEFAULT NULL, 32 | `goldstein_scale` smallint(6) DEFAULT NULL, 33 | `num_mentions` smallint(6) DEFAULT NULL, 34 | `num_sources` smallint(6) DEFAULT NULL, 35 | `num_articles` smallint(6) DEFAULT NULL, 36 | `avg_tone` smallint(6) DEFAULT NULL, 37 | `actor1_geo_type` smallint(6) DEFAULT NULL, 38 | `actor1_geo_full_name` varchar(50) DEFAULT NULL, 39 | `actor1_geo_country_code` varchar(2) DEFAULT '', 40 | `actor1_geo_adm1_code` varchar(4) DEFAULT NULL, 41 | `actor1_geo_adm2_code` varchar(5) DEFAULT NULL, 42 | `actor1_geo_lat` float DEFAULT NULL, 43 | `actor1_geo_long` float DEFAULT NULL, 44 | `actor1_geo_feature_id` varchar(10) DEFAULT NULL, 45 | `actor2_geo_type` smallint(6) DEFAULT NULL, 46 | `actor2_geo_full_name` varchar(50) DEFAULT NULL, 47 | `actor2_geo_country_code` varchar(2) DEFAULT '', 48 | `actor2_geo_adm1_code` varchar(4) DEFAULT NULL, 49 | `actor2_geo_adm2_code` varchar(5) DEFAULT NULL, 50 | `actor2_geo_lat` float DEFAULT NULL, 51 | `actor2_geo_long` float DEFAULT NULL, 52 | `actor2_geo_feature_id` varchar(10) DEFAULT NULL, 53 | `action_geo_type` smallint(6) DEFAULT NULL, 54 | `action_geo_full_name` varchar(50) DEFAULT NULL, 55 | `action_geo_country_code` varchar(2) DEFAULT NULL, 56 | `action_geo_adm1_code` varchar(4) DEFAULT NULL, 57 | `action_geo_adm2_code` varchar(5) DEFAULT NULL, 58 | `action_geo_lat` float DEFAULT NULL, 59 | `action_geo_long` float DEFAULT NULL, 60 | `action_geo_feature_id` varchar(10) DEFAULT NULL, 61 | PRIMARY KEY (`id`) 62 | ) ENGINE=InnoDB AUTO_INCREMENT=3912 DEFAULT CHARSET=utf8; -------------------------------------------------------------------------------- /gdelt_transfer.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import MySQLdb 3 | import datetime 4 | 5 | ############################################## 6 | # READ EVENTS FROM LOCAL CSV AND ONLY # 7 | # KEEP PROTEST EVENTS # 8 | ############################################## 9 | def getGDELTEvents(): 10 | gdelt_events = [] 11 | 12 | with open('/PATH/TO/latest_gdelt_events.csv', 'rb') as gdelt_events_csv: 13 | gdelt_reader = csv.reader(gdelt_events_csv, delimiter='\t') 14 | 15 | for event in gdelt_reader: 16 | #This is my own set of filters as I was interested in protest events 17 | if event[28] == "14": 18 | gdelt_events.append(event) 19 | 20 | return gdelt_events 21 | 22 | 23 | 24 | ############################################## 25 | # COMMIT PROTEST EVENTS TO DB # 26 | ############################################## 27 | def commitGDELTEvents(my_gdelt_events): 28 | # Open database connection 29 | conn = MySQLdb.connect("HOST", 30 | "USERNAME", 31 | "PASSWORD", 32 | "DATABASE") 33 | 34 | # prepare cursors 35 | cur = conn.cursor() 36 | 37 | # Iterate through events 38 | for event in my_gdelt_events: 39 | # Upload event 40 | add_event = ("INSERT INTO gdelt " 41 | "(gdelt_id, published_date, insert_date, url, " 42 | "actor1_code, actor1_name, actor1_country_code, actor1_known_group_code, actor1_ethnic_code, actor1_religion1_code, actor1_religion2_code, actor1_type1_code, actor1_type2_code, actor1_type3_code, " 43 | "actor2_code, actor2_name, actor2_country_code, actor2_known_group_code, actor2_ethnic_code, actor2_religion1_code, actor2_religion2_code, actor2_type1_code, actor2_type2_code, actor2_type3_code, " 44 | "is_root_event, event_code, event_base_code, event_root_code, quad_class, " 45 | "goldstein_scale, num_mentions, num_sources, num_articles, avg_tone, " 46 | "actor1_geo_type, actor1_geo_full_name, actor1_geo_country_code, actor1_geo_adm1_code, actor1_geo_adm2_code, actor1_geo_lat, actor1_geo_long, actor1_geo_feature_id, " 47 | "actor2_geo_type, actor2_geo_full_name, actor2_geo_country_code, actor2_geo_adm1_code, actor2_geo_adm2_code, actor2_geo_lat, actor2_geo_long, actor2_geo_feature_id, " 48 | "action_geo_type, action_geo_full_name, action_geo_country_code, action_geo_adm1_code, action_geo_adm2_code, action_geo_lat, action_geo_long, action_geo_feature_id) " 49 | "VALUES (%s, %s, %s, %s, " 50 | "%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, " 51 | "%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, " 52 | "%s, %s, %s, %s, %s, " 53 | "%s, %s, %s, %s, %s, " 54 | "%s, %s, %s, %s, %s, %s, %s, %s, " 55 | "%s, %s, %s, %s, %s, %s, %s, %s, " 56 | "%s, %s, %s, %s, %s, %s, %s, %s)") 57 | 58 | cur.execute(add_event, [event[0] if event[0] != "" else None, event[1] if event[1] != "" else None, datetime.datetime.now(), event[60] if event[60] != "" else None, 59 | event[5] if event[5] != "" else None, event[6] if event[6] != "" else None, event[7] if event[7] != "" else None, event[8] if event[8] != "" else None, event[9] if event[9] != "" else None, event[10] if event[10] != "" else None, event[11] if event[11] != "" else None, event[12] if event[12] != "" else None, event[13] if event[13] != "" else None, event[14] if event[14] != "" else None, 60 | event[15] if event[15] != "" else None, event[16] if event[16] != "" else None, event[17] if event[17] != "" else None, event[18] if event[18] != "" else None, event[19] if event[19] != "" else None, event[20] if event[20] != "" else None, event[21] if event[21] != "" else None, event[22] if event[22] != "" else None, event[23] if event[23] != "" else None, event[24] if event[24] != "" else None, 61 | event[25] if event[25] != "" else None, event[26] if event[26] != "" else None, event[27] if event[27] != "" else None, event[28] if event[28] != "" else None, event[29] if event[29] != "" else None, 62 | event[30] if event[30] != "" else None, event[31] if event[31] != "" else None, event[32] if event[32] != "" else None, event[33] if event[33] != "" else None, event[34] if event[34] != "" else None, 63 | event[35] if event[35] != "" else None, event[36] if event[36] != "" else None, event[37] if event[37] != "" else None, event[38] if event[38] != "" else None, event[39] if event[39] != "" else None, event[40] if event[40] != "" else None, event[41] if event[41] != "" else None, event[42] if event[42] != "" else None, 64 | event[43] if event[43] != "" else None, event[44] if event[44] != "" else None, event[45] if event[45] != "" else None, event[46] if event[46] != "" else None, event[47] if event[47] != "" else None, event[48] if event[48] != "" else None, event[49] if event[49] != "" else None, event[50] if event[50] != "" else None, 65 | event[51] if event[51] != "" else None, event[52] if event[52] != "" else None, event[53] if event[53] != "" else None, event[54] if event[54] != "" else None, event[55] if event[55] != "" else None, event[56] if event[56] != "" else None, event[57] if event[57] != "" else None, event[58] if event[58] != "" else None]) 66 | 67 | conn.commit() 68 | 69 | 70 | # disconnect from server 71 | conn.close() 72 | 73 | 74 | 75 | if __name__ == "__main__": 76 | gdelt_events = getGDELTEvents() 77 | commitGDELTEvents(gdelt_events) --------------------------------------------------------------------------------