├── requirements.txt ├── out_files ├── rows_with_no_files.txt ├── governments_1989onwards.csv ├── files_with_content_problems.txt └── extra_roles_manually_collected.csv ├── src ├── csv_concat.py ├── add_gender_to_members.py ├── web_crawler_for_parliament_members.py ├── fill_proedr_names.py ├── web_crawler_for_proceeding_files.py ├── web_crawler_for_proceeding_files_old.py ├── convert2txt.py ├── greek_name_cases_wiki_crawler.py ├── join_members_activity.py ├── web_crawler_for_government_members.py ├── parl_members_data_cleaner.py ├── member_speech_matcher.py └── gov_members_data_cleaner.py └── README.md /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.9.1 2 | jellyfish==0.8.2 3 | json5==0.9.5 4 | numpy==1.19.1 5 | pandas==1.1.1 6 | Pygments==2.7.1 7 | pygtrie==2.3.3 8 | python-dateutil==2.8.1 9 | requests==2.24.0 10 | selenium==3.141.0 11 | urllib3==1.25.10 -------------------------------------------------------------------------------- /out_files/rows_with_no_files.txt: -------------------------------------------------------------------------------- 1 | Page 413 and date 26/01/1994 2 | Page 407 and date 17/05/1994 3 | Page 404 and date 13/07/1994 4 | Page 375 and date 09/01/1998 (πρωί) 5 | Page 374 and date 05/02/1998 (πρωί) 6 | Page 369 and date 26/05/1998 7 | Page 363 and date 29/09/1998 8 | Page 363 and date 23/09/1998 9 | Page 363 and date 15/09/1998 10 | Page 290 and date 28/07/2002 11 | Page 259 and date 07/06/2004 12 | Page 206 and date 13/02/2007 13 | -------------------------------------------------------------------------------- /src/csv_concat.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pandas as pd 3 | import os 4 | 5 | '''This script concatenates the outputs of member_speech_matcher.py in case 6 | the latter ran in parallel on multiple different batches of the record files.''' 7 | dir = 
'../tell_all_batches/' 8 | 9 | with open('../out_files/tell_all_final.csv', 'w+', encoding='utf-8', newline = '') as outfile: 10 | 11 | for i,filepath in enumerate(sorted([os.path.join(os.path.abspath(dir), name) 12 | for name in os.listdir(dir) 13 | if not name.startswith('.')])): 14 | print(filepath) 15 | if i == 0: 16 | combined_df = pd.read_csv(filepath, encoding='utf-8') 17 | else: 18 | combined_df = pd.concat([combined_df, pd.read_csv(filepath, encoding='utf-8')], 19 | ignore_index=True) 20 | 21 | combined_df.to_csv(outfile) 22 | -------------------------------------------------------------------------------- /out_files/governments_1989onwards.csv: -------------------------------------------------------------------------------- 1 | gov_name,date_from,date_to,gov_url 2 | τζαννετακη τζαννη,1989-07-02,1989-10-12,https://gslegal.gov.gr/?p=1294 3 | γριβα ιωαννη,1989-10-12,1989-11-23,https://gslegal.gov.gr/?p=1297 4 | ζολωτα ξενοφωντα,1989-11-23,1990-04-11,https://gslegal.gov.gr/?p=1301 5 | μητσοτακη κωνσταντινου,1990-04-11,1993-10-13,https://gslegal.gov.gr/?p=1304 6 | παπανδρεου ανδρεα,1993-10-13,1996-01-22,https://gslegal.gov.gr/?p=1307 7 | σημιτη κωνσταντινου,1996-01-22,1996-09-25,https://gslegal.gov.gr/?p=1310 8 | σημιτη κωνσταντινου,1996-09-25,2000-04-13,https://gslegal.gov.gr/?p=1315 9 | σημιτη κωνσταντινου,2000-04-13,2004-03-10,https://gslegal.gov.gr/?p=1320 10 | καραμανλη α. κωνσταντινου,2004-03-10,2007-09-19,https://gslegal.gov.gr/?p=1323 11 | καραμανλη α. κωνσταντινου,2007-09-19,2009-10-07,https://gslegal.gov.gr/?p=1328 12 | παπανδρεου α. γεωργιου,2009-10-06,2011-11-11,https://gslegal.gov.gr/?p=1335 13 | παπαδημου λουκα δ.,2011-11-11,2012-05-17,https://gslegal.gov.gr/?p=2299 14 | πικραμμενου παναγιωτη οθ. (υπηρεσιακη),2012-05-17,2012-06-21,https://gslegal.gov.gr/?p=2788 15 | σαμαρα κ. αντωνιου,2012-06-21,2015-01-26,https://gslegal.gov.gr/?p=2880 16 | τσιπρα π. 
import re

import pandas as pd

'''This script adds a 'gender' column to the parliament members activity file,
based on lists of known female and male Greek first names.'''

FEMALE_NAMES_FILE = '../out_files/female_names_alternatives_gr.txt'
MALE_NAMES_FILE = '../out_files/male_names_alternatives_gr.txt'
MEMBERS_FILE = '../out_files/parl_members_activity_1989onwards.csv'
OUTPUT_FILE = '../out_files/parl_members_activity_1989onwards_with_gender.csv'

# First names that are added to the names read from the files above
EXTRA_MALE_NAMES = (
    'διακος', 'τσετιν', 'σπυροπανος', 'σπυριδωνας', 'τερενς',
    'αιχαν', 'χουσειν', 'πυρρος', 'γκαληπ', 'μπηρολ', 'φιντιας',
    'τραιανος', 'αχμετ', 'αθηναιος', 'φρανς', 'τζαννης',
    'ροβερτος', 'μουσταφα', 'κλεων', 'παρισης', 'παυσανιας',
    'μεχμετ', 'αμετ', 'μπουρχαν', 'πανουργιας', 'γιανης',
    'ιλχαν', 'πυθαγορας', 'φραγκλινος', 'ισμαηλ', 'θαλασσινος',
)
EXTRA_FEMALE_NAMES = (
    'ελεωνορα', 'κρινιω', 'ιωαννετα', 'σουλτανα', 'ηρω',
    'συλβα', 'χρυσουλα', 'ελισσαβετ', 'βιργινια', 'ροδουλα',
    'καλλιοπη', 'γεσθημανη', 'φερονικη', 'χρυση', 'ολυμπια',
)


def read_names(path):
    """Return the set of names stored in the text file at `path`.

    Names are separated by commas and/or whitespace. The '*' inside the
    character class is a literal asterisk, so asterisks act as separators
    too (kept from the original pattern). Empty fragments are discarded.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return {name for name in re.split(r'[,\n\s*]', f.read()) if name}


def extract_first_name(member_name):
    """Return the lowercased first name of a member, or None if unavailable.

    The first name is taken from the third space-separated token of
    `member_name`. For hyphenated double names only the part before the
    hyphen is kept. Values that are not strings (e.g. NaN) or that have
    fewer than three tokens yield None instead of raising.
    """
    if not isinstance(member_name, str):
        return None
    tokens = member_name.split(' ')
    if len(tokens) < 3:
        return None
    return tokens[2].lower().split('-')[0]


def classify_gender(member_name, female_names, male_names):
    """Return 'female', 'male' or '' for one member name.

    Names found in both sets are reported for manual checking and left
    empty; names found in neither set are reported and left empty as well.
    """
    first_name = extract_first_name(member_name)

    if first_name in female_names and first_name in male_names:
        print('check manually case of ', member_name, '. The name is unisex.')
        return ''
    if first_name in female_names:
        return 'female'
    if first_name in male_names:
        return 'male'

    print('Name not categorized in any gender: ', member_name)
    return ''


def add_gender_column(df, female_names, male_names):
    """Fill df['gender'] from df['member_name'] and return `df`.

    The column is assigned on the dataframe itself. Assigning to the row
    objects yielded by DataFrame.iterrows() only changes a copy of each row
    and leaves the dataframe untouched.
    """
    female_names = set(female_names)
    male_names = set(male_names)
    df['gender'] = [classify_gender(name, female_names, male_names)
                    for name in df['member_name']]
    return df


def main():
    """Read the name lists and the members file, then write the output CSV."""
    female_names = read_names(FEMALE_NAMES_FILE) | set(EXTRA_FEMALE_NAMES)
    male_names = read_names(MALE_NAMES_FILE) | set(EXTRA_MALE_NAMES)

    df = pd.read_csv(MEMBERS_FILE)
    add_gender_column(df, female_names, male_names)
    df.to_csv(OUTPUT_FILE, header=True, index=False, encoding='utf-8')


if __name__ == '__main__':
    main()
Επιλέξτε Βουλευτή': 44 | 45 | member_counter+=1 46 | if member_counter%50==0: 47 | time.sleep(15) 48 | print(str(member_counter)+' from '+str(len(members_links)-1)) 49 | print("Name: ",member) 50 | member_URL = _URL+'?MpId='+link 51 | print("Processing page ",member_URL,"\n") 52 | 53 | driver.get(member_URL) 54 | time.sleep(3) 55 | html = driver.page_source 56 | soup_member = BeautifulSoup(html, "html.parser") 57 | 58 | #if the page has no table 59 | if not soup_member.find("tbody"): 60 | original_members_data.write('No:'+str(member_counter)+',Name:'+member+',NO DATA\n') 61 | 62 | else: 63 | trs = soup_member.find("tbody").find_all("tr", {"class":["odd", "even"]}) 64 | 65 | for tr in trs: 66 | 67 | td_columns = [td.getText() for td in tr.find_all("td")] 68 | 69 | period = td_columns[0] 70 | period = re.sub(r"\s+", "", period) 71 | if '-)' in period: 72 | 73 | # for example Period:ΙΖ΄(20/09/2015-) means it continues up to today 74 | period = re.sub('-\)', '-'+ now.strftime("%d/%m/%Y")+')', period) 75 | 76 | date = td_columns[1] 77 | date = re.sub(r"\s+", "", date) 78 | 79 | administrative_region = td_columns[2] 80 | administrative_region = re.sub(r"\s+", "", administrative_region) 81 | 82 | parliamentary_party = td_columns[3] 83 | parliamentary_party = re.sub(r"\s+", "", parliamentary_party) 84 | 85 | description = td_columns[4] 86 | description = re.sub(r"\s+", "", description) 87 | 88 | csv_writer.writerow(['No:'+str(member_counter), 89 | 'Name:'+member, 90 | 'Period:'+period, 91 | 'Date:'+date, 92 | 'Administrative-Region:'+administrative_region, 93 | 'Parliamentary-Party:'+parliamentary_party, 94 | 'Description:'+description]) 95 | 96 | driver.close() -------------------------------------------------------------------------------- /src/fill_proedr_names.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | # Input file created by member_speech_matcher.py 5 | df = 
pd.read_csv('../out_files/tell_all.csv', encoding='utf-8') 6 | 7 | # Delete columns not needed for creating reference df_proedr 8 | subdf = df.copy() 9 | del subdf['speech'] 10 | del subdf['parliamentary_session'] 11 | del subdf['parliamentary_period'] 12 | 13 | # Choose rows with only proedros and proedreuon speaker info, as search reference 14 | df_proedr = subdf.loc[((subdf['speaker_info'] == 'προεδρευων') | 15 | (subdf['speaker_info'] == 'προεδρος') | 16 | (subdf['speaker_info'] == 'προσωρινος προεδρος') 17 | )] 18 | 19 | ''' Group df so that each row has name(s) of proedreuon or proedros for specific 20 | date and sitting joined with a ' / ' 21 | note: in some cases multiple proedreuontes iterate within a long sitting''' 22 | df_grouped_proedr = df_proedr.groupby(['sitting_date', 'parliamentary_sitting', 23 | 'speaker_info', 'member_gender', 'government', 24 | 'member_region', 'roles', 'political_party'] 25 | )['member_name'].apply(lambda x: ' / '.join(set(x.dropna()))).reset_index() 26 | 27 | # Fill emptied cells with NaN value 28 | df_grouped_proedr = df_grouped_proedr.replace('', np.nan) 29 | 30 | 31 | ''' For each of the rows of the initial file that does not have a proedreuon or proedros name 32 | search in the new df of proedreuontes for the name and replace NaN with the actual name''' 33 | proedr_nan = 0 34 | 35 | for index, row in df.iterrows(): 36 | 37 | # if row member name is NaN 38 | if pd.isnull(row['member_name']): 39 | 40 | if (row['speaker_info'] == 'προεδρευων'): 41 | 42 | proedr_nan += 1 43 | 44 | # Get the respective row from reference dataframe df_grouped_proedr 45 | proedeuon_slice = df_grouped_proedr.loc[(df_grouped_proedr['sitting_date'] == row['sitting_date']) 46 | & (df_grouped_proedr['parliamentary_sitting'] == row['parliamentary_sitting']) 47 | & (df_grouped_proedr['speaker_info'] == 'προεδρευων')].reset_index() 48 | 49 | if not proedeuon_slice.empty: 50 | if not pd.isnull(proedeuon_slice['member_name'][0]): 51 | 52 | # Replace NaN 
with name, only if name is only one 53 | if len(proedeuon_slice['member_name'][0].split('/')) == 1: 54 | proedr_name = proedeuon_slice['member_name'][0] #row 0 column member_name 55 | df.loc[index, 'member_name'] = proedr_name 56 | df.loc[index, 'member_gender'] = proedeuon_slice['member_gender'][0] 57 | df.loc[index, 'government'] = proedeuon_slice['government'][0] 58 | df.loc[index, 'member_region'] = proedeuon_slice['member_region'][0] 59 | df.loc[index, 'roles'] = proedeuon_slice['roles'][0] 60 | df.loc[index, 'political_party'] = proedeuon_slice['political_party'][0] 61 | 62 | elif ((row['speaker_info'] == 'προεδρος') | (row['speaker_info'] == 'προσωρινος προεδρος')): 63 | 64 | proedr_nan += 1 65 | 66 | # Get the respective row from reference dataframe df_grouped_proedr 67 | proedros_slice = df_grouped_proedr.loc[(df_grouped_proedr['sitting_date'] == row['sitting_date']) 68 | & (df_grouped_proedr['parliamentary_sitting'] == row['parliamentary_sitting']) 69 | & ((df_grouped_proedr['speaker_info'] == 'προεδρος') | 70 | (df_grouped_proedr['speaker_info'] == 'προσωρινος προεδρος') 71 | )].reset_index() 72 | 73 | if not proedros_slice.empty: 74 | 75 | if not pd.isnull(proedros_slice['member_name'][0]): 76 | 77 | # Replace NaN with name, only if name is only one 78 | if len(proedros_slice['member_name'][0].split('/')) == 1: 79 | proedr_name = proedros_slice['member_name'][0] #row 0 column member_name 80 | df.loc[index, 'member_name'] = proedr_name 81 | df.loc[index, 'member_gender'] = proedros_slice['member_gender'][0] 82 | df.loc[index, 'government'] = proedros_slice['government'][0] 83 | df.loc[index, 'member_region'] = proedros_slice['member_region'][0] 84 | df.loc[index, 'roles'] = proedros_slice['roles'][0] 85 | df.loc[index, 'political_party'] = proedros_slice['political_party'][0] 86 | 87 | if index%5000 == 0: 88 | print(index) 89 | 90 | df.to_csv('../out_files/tell_all_FILLED.csv', index=False, na_rep=np.nan) 91 | 92 | print('All NaN proedr cells are: ', 
def create_target_path(target_data_folder, tr, entry_counter, ext):
    """Build the destination path for the record file of one table row.

    Parameters:
        target_data_folder: folder in which the renamed file is stored.
        tr: table row object; the text of its first four <td> cells is read
            as date, period, session and sitting (in that order).
        entry_counter: running number that becomes part of the file name.
        ext: file extension without the leading dot.

    Returns:
        target_data_folder joined with
        '<yyyy-mm-dd>_<entry_counter>_<period>_<session>_<sitting>.<ext>'.
    """
    # lowercased text of the first four columns
    cells = [td.getText().lower() for td in tr.find_all("td")[:4]]
    date, period, session, sitting = cells[0], cells[1], cells[2], cells[3]

    # 'dd/mm/yyyy' -> 'dd-mm-yyyy', then reorder by position to 'yyyy-mm-dd';
    # anything after the tenth character (e.g. a time-of-day note) is dropped
    date = date.replace("/", "-").replace(" ", "")
    reversed_date = date[6:10] + date[5] + date[3:5] + date[2] + date[0:2]

    # segments separated with underscore
    target_filename = "_".join(
        [reversed_date, str(entry_counter), period, session, sitting]) + "." + ext

    return os.path.join(target_data_folder, target_filename)


def download_file(driver, downloaded_data_folder, file_URL, target_path,
                  wait_seconds=7, timeout=60):
    """Click the link to `file_URL` and copy the downloaded file to `target_path`.

    Parameters:
        driver: selenium webdriver showing the page that contains the link.
        downloaded_data_folder: folder where the browser stores downloads.
        file_URL: value of the href attribute of the link to click.
        target_path: destination of the copied (and thereby renamed) file.
        wait_seconds: pause after the click before looking for the file.
        timeout: additional seconds to keep waiting if the file has not
            appeared after `wait_seconds`.

    Raises:
        FileNotFoundError: from shutil.copy, if the file still does not exist
            once the timeout has expired.
    """
    element = driver.find_element_by_xpath('//a[@href="' + file_URL + '"]')
    driver.execute_script("arguments[0].scrollIntoView();", element)
    element.click()

    # unquote: decode the url-encoded file name to match the downloaded
    # filename (often in cases with Greek letters in filename); the folder
    # part is a local path and is left as it is
    downloaded_name = urllib.parse.unquote(ntpath.basename(file_URL))
    downloaded_file_path = os.path.join(downloaded_data_folder, downloaded_name)

    # always pause first, then keep polling for slow downloads instead of
    # failing right away
    time.sleep(wait_seconds)
    deadline = time.time() + timeout
    while not os.path.exists(downloaded_file_path) and time.time() < deadline:
        time.sleep(0.5)

    shutil.copy(downloaded_file_path, target_path)  # copy and rename file
included to the name of each file 46 | 47 | downloaded_data_folder = '../original_data_download_folder/' 48 | if not os.path.exists(downloaded_data_folder): 49 | os.makedirs(downloaded_data_folder) 50 | 51 | target_data_folder = '../original_data/' 52 | if not os.path.exists(target_data_folder): 53 | os.makedirs(target_data_folder) 54 | 55 | # set preferred download folder 56 | chromeOptions = webdriver.ChromeOptions() 57 | prefs = {"profile.default_content_settings.popups": 0, 58 | "download.default_directory" : os.path.abspath(downloaded_data_folder), 59 | "directory_upgrade": True} 60 | chromeOptions.add_experimental_option("prefs",prefs) 61 | 62 | # chromedriver.exe located in the same folder as the script 63 | driver = webdriver.Chrome('./chromedriver', options=chromeOptions) 64 | 65 | #Open a file in order to write down the rows with no files 66 | with codecs.open('../out_files/rows_with_no_files.txt','w+', encoding='utf-8') as no_files: 67 | 68 | # Choose range of pages 69 | for pageNo in range (100,0,-1): 70 | 71 | print('Sleeping 5 seconds') 72 | page_URL = _URL+str(pageNo) 73 | print("Processing page",pageNo,"\n") 74 | driver.get(page_URL) 75 | time.sleep(5) 76 | 77 | html = driver.page_source 78 | soup = BeautifulSoup(html, "html.parser") 79 | trs = soup.find("tbody").find_all("tr", {"class":["odd", "even"]}) 80 | 81 | for tr in trs: 82 | 83 | entry_counter += 1 84 | print("No. 
", entry_counter) 85 | 86 | files={} #dictionary with file extensions as keys and their links as values 87 | 88 | # From each table row return all the links 89 | for link in tr.findAll('a', href=True): 90 | 91 | href = link.get('href') 92 | 93 | # Keep the links that lead to the requested files and the 94 | # corresponding filetypes 95 | if url_part in href: 96 | files.update({(href.split(".")[-1]).lower(): href}) 97 | 98 | if len(files)==0: 99 | no_files.write('Page ' + str(pageNo) + " and date " + tr.find( 100 | 'td').getText() + " \n") 101 | print('File not found') 102 | else: 103 | # Download the file with the following preference order 104 | if "txt" in (ext.lower() for ext in files.keys()): 105 | file_ext = 'txt' 106 | elif "docx" in (ext.lower() for ext in files.keys()): 107 | file_ext = 'docx' 108 | elif "doc" in (ext.lower() for ext in files.keys()): 109 | file_ext = 'doc' 110 | elif "pdf" in (ext.lower() for ext in files.keys()): 111 | file_ext = 'pdf' 112 | 113 | file_URL = files[file_ext] 114 | print("File url: ", file_URL) 115 | 116 | target_path = create_target_path(target_data_folder, tr, entry_counter, file_ext) 117 | 118 | download_file(driver, downloaded_data_folder, file_URL, target_path) 119 | 120 | # Add sleeping time for the script every 50 files 121 | if ((entry_counter != 0) and ((entry_counter % 100) == 0)): 122 | print("Let's sleep for 3 minutes...") 123 | time.sleep(180) 124 | print("Up and running again...") 125 | 126 | driver.close() -------------------------------------------------------------------------------- /src/web_crawler_for_proceeding_files_old.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from bs4 import BeautifulSoup 3 | import time 4 | import requests 5 | import codecs 6 | 7 | """ This script is currently not working properly since the Greek Parliament website 8 | installed cyber security software in 2020. 
# Seconds to wait for the server before a request is abandoned
REQUEST_TIMEOUT = 60


def _record_path(reversed_date, downloaded_counter, period, session, sitting, ext):
    """Return the path of one record inside '../original_data/'."""
    # segments separated with underscore
    return ('../original_data/' + reversed_date + '_' + str(downloaded_counter)
            + "_" + period + "_" + session + "_" + sitting + "." + ext)


def record_retrieval(files, downloaded_counter, href, tr, ext):
    """Download the record of one table row and return the updated counter.

    Parameters:
        files: dictionary with file extensions as keys and their links as
            values; empty when the row offers no record file.
        downloaded_counter: number of records downloaded so far.
        href: link of the row; used when `files` has no entry for `ext`.
        tr: table row; the text of its first four <td> cells is read as
            date, period, session and sitting.
        ext: extension for the saved file.

    The module-level `domain` is prepended to the link. When `files` is empty
    the linked html page is saved as text through get_html, otherwise the
    file is downloaded through get_file.
    """
    downloaded_counter += 1
    print("No. ", downloaded_counter)

    # The script passes the last href it saw in the row, which is not
    # necessarily the link of the requested extension; take it from `files`.
    for file_ext, file_href in files.items():
        if file_ext.lower() == ext.lower():
            href = file_href
            break

    file_URL = domain + href
    print("File url: ", file_URL)

    # get text from 4 first columns
    td_columns = [td.getText().lower() for td in tr.find_all("td")[:4]]
    date = td_columns[0]
    period = td_columns[1]
    session = td_columns[2]
    sitting = td_columns[3]

    # 'dd/mm/yyyy' -> 'yyyy-mm-dd', reordered by character position
    date = (date.replace("/", "-")).replace(" ", "")
    reversed_date = date[6:10] + date[5] + date[3:5] + date[2] + date[0:2]

    if len(files) == 0:
        get_html(file_URL, downloaded_counter, reversed_date, period, session, sitting, ext)
    else:
        get_file(file_URL, downloaded_counter, reversed_date, period, session, sitting, ext)

    # Add sleeping time for the script every 1000 files
    if (downloaded_counter % 1000) == 0:
        print("Let's sleep for 5 minutes...")
        time.sleep(300)
        print("Up and running again...")

    return downloaded_counter


def get_html(file_URL, downloaded_counter, reversed_date, period, session, sitting, ext):
    """Save the text of the record shown on the html page `file_URL`.

    The text is read from the span with id
    'ctl00_ContentPlaceHolder1_sri_lblBody' and written utf-8 encoded.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the page does not contain the expected span.
    """
    nested_response = requests.get(file_URL, timeout=REQUEST_TIMEOUT)
    # do not store an error page as if it were a record
    nested_response.raise_for_status()

    soup2 = BeautifulSoup(nested_response.text, "html.parser")
    body = soup2.find("span", id="ctl00_ContentPlaceHolder1_sri_lblBody")
    if body is None:
        raise ValueError("No record body found in page " + file_URL)
    plain_content = body.get_text(separator="\n")

    path = _record_path(reversed_date, downloaded_counter, period, session, sitting, ext)
    with codecs.open(path, 'w+', encoding='utf-8') as new_file:
        new_file.write(plain_content)

    print("Saved as " + path, "\n")


def get_file(file_URL, downloaded_counter, reversed_date, period, session, sitting, ext):
    """Download the file at `file_URL` and store its bytes unchanged.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    path = _record_path(reversed_date, downloaded_counter, period, session, sitting, ext)

    filecontent = requests.get(file_URL, timeout=REQUEST_TIMEOUT)
    # do not store an error page under the name of a record file
    filecontent.raise_for_status()

    with open(path, 'wb') as f:  # open file in binary mode
        f.write(filecontent.content)

    print("Saved as " + path, "\n")
def greek_numerals_to_numbers(numeral):
    """Convert a Greek alphabetic numeral to its value as a decimal string.

    The values of all letters are added up. The pair 'στ' counts as 6; a 'σ'
    that is not followed by 'τ' counts as 200 and a 'τ' that does not follow
    a 'σ' counts as 300. Latin look-alike letters are accepted as well.
    Numeral marks, parentheses, semicolons, spaces and digits are ignored.

    Parameters:
        numeral: the numeral text, e.g. "ιστ'".

    Returns:
        The number as a string, "0" when no letter is left.

    Raises:
        KeyError: if a remaining character is not a known numeral letter.
    """
    greek_numerals = {"α": 1, "a": 1, "β": 2, "b": 2, "γ": 3, "δ": 4, "ε": 5, "έ": 5, "e": 5,
                      "στ": 6, "ζ": 7, "z": 7, "η": 8, "ή": 8, "h": 8, "θ": 9,
                      "ι": 10, "i": 10, "κ": 20, "k": 20, "λ": 30, "μ": 40,
                      "m": 40, "ν": 50, "n": 50, "ξ": 60, "ο": 70, "o": 70, "ό": 70,
                      "π": 80, "ϙ": 90, "ϟ": 90, "ρ": 100, "p": 100, "σ": 200, "τ": 300,
                      "t": 300, "υ": 400, "φ": 500, "χ": 600, "ψ": 700, "ω": 800,
                      "ϡ": 900}

    # only the letters carry a value
    letters = re.sub(r"[΄'`’(); \d]", "", numeral).lower()

    number = 0
    position = 0
    while position < len(letters):
        if letters.startswith("στ", position):
            # σ followed by τ is the two-letter spelling of 6
            number += greek_numerals["στ"]
            position += 2
        else:
            number += greek_numerals[letters[position]]
            position += 1

    return str(number)
', counter) 77 | 78 | segments = (re.sub("[΄'`’]", "'", os.path.splitext(filename)[0])).split('_') #segments of filename seperated with underscore 79 | part1 = '_'.join(segments[:2]) #Date and counter number 80 | 81 | #PERIOD 82 | 83 | period = segments[2] 84 | 85 | if period!='': 86 | period = re.sub(r'[\(-\)-]', '', period) #remove parentheses and dashes 87 | 88 | if "θ'περιοδος" in period: 89 | period = period.replace("θ'","θ' ") 90 | 91 | if 'προεδρευομενης κοινοβουλευτικης δημοκρατιας' in period: 92 | period = period.replace('προεδρευομενης κοινοβουλευτικης δημοκρατιας', 'presided-parliamentary-republic') 93 | 94 | period = period.split(' ') 95 | period_number = greek_numerals_to_numbers(period[0]) 96 | 97 | if 'αναθεωρητική' in period: 98 | review_number = greek_numerals_to_numbers(period[2]) 99 | new_period = 'period-'+period_number+'-review-'+review_number+'-'.join(period[4:]) 100 | else: 101 | new_period = 'period-'+period_number+'-'+'-'.join(period[2:]) 102 | 103 | else: 104 | new_period = '' 105 | 106 | #SESSION 107 | 108 | session = segments[3] 109 | 110 | if session!='': 111 | session = session.replace("γ'τμήμα", "γ' τμήμα") 112 | 113 | section = session.split(' ') 114 | if "'" in section[0]: 115 | session_number = greek_numerals_to_numbers(section[0]) 116 | 117 | if (re.search(r'\d', section[-1])): 118 | year = re.sub("[()]", '', section[-1]) 119 | 120 | if 'τμήμα διακοπής εργασιών βουλής θέρους' in session: 121 | new_session = year+'-summer-recess-section-'+session_number 122 | elif 'θέρο' in session: 123 | session = session.replace('θέρος', 'summer') 124 | session = session.replace('συνέχιση θέρους', 'continuation-of-summer-recess') 125 | new_session = session.replace(' ', '-') 126 | elif 'έκτακτη σύνοδος' in session: 127 | new_session = session.replace('έκτακτη σύνοδος', 'parliament-recall-extraordinary-session') 128 | elif 'συνέχιση ολομέλειας' in session: 129 | new_session = 'session-'+session_number+'-(continuation-of-plenary-session)' 130 | 
else: 131 | new_session = 'session'+'-'+session_number 132 | else: 133 | new_session = '' 134 | 135 | #SITTING 136 | 137 | sitting = segments[4] 138 | 139 | if sitting!='': 140 | if sitting=='ειδικη συνεδριαση ημερα της γυναικας': 141 | new_sitting = "special-sitting-international-women-'s-day" 142 | elif sitting=='ειδικη ημερησια διαταξη της ολομελειας της βουλης': 143 | new_sitting = 'a-special-agenda-for-the-plenary-session-of-the-parliament' 144 | elif sitting=='ειδικη εκδηλωση για την επετειο της γενοκτονιας των ποντιων στη βουλη': 145 | new_sitting = 'special-event-anniversary-of-Pontic-Greek-genocide' 146 | elif sitting=='βουλη των εφηβων': 147 | new_sitting = 'Youth-Parliament' 148 | else: 149 | sitting_number = greek_numerals_to_numbers(sitting) 150 | new_sitting = 'sitting-'+sitting_number 151 | else: 152 | new_sitting = '' 153 | 154 | ext = os.path.splitext(filename)[1] #initial file extension including dot 155 | 156 | #Compose new name without extension 157 | new_filename = part1+'_'+new_period+'_'+new_session+'_'+new_sitting+ext 158 | 159 | # copy and rename file to new location 160 | shutil.copy(os.path.join(datapath, filename), 161 | os.path.join(new_datapath, new_filename)) 162 | 163 | if ext.lower()!='.txt': 164 | command = 'java -jar tika-app-1.20.jar --text --encoding=utf-8 '+os.path.join(new_datapath, new_filename)\ 165 | +'>'+os.path.join(new_datapath, os.path.splitext(new_filename)[0])+'.txt' 166 | 167 | print(command) 168 | subprocess.call(command, shell=True) #shell=True hides console window 169 | 170 | # delete initial non-txt files and keep only converted files 171 | os.remove(os.path.join(new_datapath, new_filename)) 172 | else: 173 | print('File already in txt format.\n') 174 | 175 | renaming_log.write(b'Before: '+filename.encode("utf-8")+b'\nAfter: '+(os.path.splitext(new_filename)[0]+'.txt').encode("utf-8")+b'\n\n') -------------------------------------------------------------------------------- /src/greek_name_cases_wiki_crawler.py: 
def concat_json_files(file_paths):
    """Merge the JSON objects stored in several utf-8 files into one dict.

    Parameters:
        file_paths: iterable with the paths of the JSON files.

    Returns:
        A dictionary with the entries of all files; when a key appears in
        more than one file, the value of the later file is kept.
    """
    ultimate_dict = {}
    for file_path in file_paths:
        with codecs.open(file_path, 'r', encoding='utf-8') as f:
            ultimate_dict.update(json.load(f))

    return ultimate_dict


def dict_to_file(dict_500, c, out_dir=None):
    """Write `dict_500` as indented utf-8 JSON to '<c>.json' and return `c`.

    Parameters:
        dict_500: dictionary to store.
        c: counter value used as the file name.
        out_dir: target folder; when omitted, the module-level `dirpath` is
            used (it is defined outside this part of the file).

    The target folder is created if it does not exist yet.
    """
    target_dir = dirpath if out_dir is None else out_dir
    os.makedirs(target_dir, exist_ok=True)

    file_500 = os.path.join(target_dir, str(c) + '.json')
    with codecs.open(file_500, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps Greek text readable in the file
        json.dump(dict_500, f, ensure_ascii=False, indent=4)

    return c


def pad_dict_list(dict_list):
    """Pad every value of `dict_list` with spaces up to the longest value.

    Values shorter than the longest one are extended in place with `+=`:
    a string gets trailing spaces, a list gets one ' ' element per missing
    position. The same dictionary object is returned.
    """
    max_length = max((len(value) for value in dict_list.values()), default=0)

    for key, value in dict_list.items():
        missing = max_length - len(value)
        if missing > 0:
            dict_list[key] += ' ' * missing

    return dict_list
'γενική': '', 70 | 'αιτιατική': '', 'κλητική': ''}} 71 | 72 | # Get content of name's page 73 | name_url = entry.find('a')['href'] 74 | response = requests.get(urllib.parse.urljoin(domain_URL,name_url)) 75 | name_html = response.text 76 | name_soup = BeautifulSoup(name_html, "html.parser") 77 | 78 | if name_soup.find("div", {"class": "NavFrame"}): 79 | name_soup.find("div", {"class": "NavFrame"}).decompose() 80 | 81 | tables = [table for table in name_soup.find_all('tbody') if 82 | any(word in table.text.lower() for word in ['πτώσεις', 'πτώση'])] 83 | 84 | if len(tables) != 0: 85 | 86 | if len(tables) >1: 87 | print(name, ': tables more than 1\n'+urllib.parse.urljoin(domain_URL,name_url)) 88 | 89 | # Choose the right table 90 | li_tags = name_soup.find_all('li') 91 | for li_tag in li_tags: 92 | if listing_category in li_tag.text.lower(): 93 | print('Choosing \"', listing_category, '\" section.\n') 94 | table = li_tag.find_previous('tbody') 95 | break # execute following commands outside this for-loop 96 | 97 | if table != None: 98 | 99 | # if wrong table is found 100 | if not any(word in table.text.lower() for word in ['πτώσεις', 'πτώση']): 101 | print(name, ': previous table of li_tag is not the correct one.') 102 | dict_500[name] = table_dict 103 | continue # continue with next iteration of for-loop 104 | else: 105 | # if no table is found 106 | dict_500[name] = table_dict 107 | print(name, ': table == None') 108 | continue # continue with next iteration of for-loop 109 | 110 | else: 111 | # take only the first table 112 | table = tables[0] 113 | 114 | column_names = [th.text.lower().strip() for th in table.find_all('th')] 115 | # 0 for first row, 1 for notes 116 | trs = [tr for tr in table.find_all('tr') if len(tr.find_all('td')) > 1] 117 | 118 | # for each row 119 | for tr in trs: 120 | 121 | row_tds = tr.find_all('td') 122 | 123 | if len(row_tds) != 1: # discard notes at the bottom of tables 124 | 125 | # discard articles (grammar) that have center alignement 
126 | row_values = [td.text.strip() for td in row_tds if td.attrs.get('align') != 'center'] 127 | 128 | # if each row has amount of cells equal or less than amount of columns 129 | if len(row_values) <= len(column_names): 130 | 131 | # for each column of the table 132 | for i in range(1,len(row_values)): 133 | value = row_values[i] 134 | # remove digits and * symbol 135 | value = re.sub(r'[\d+\*]', '', value) 136 | 137 | try: 138 | if 'δοτική' not in row_values[0].lower(): #discard δοτική 139 | if '&' in value: 140 | value = [v.strip() for v in value.split('&')] 141 | if ' και ' in value: 142 | value = [v.strip() for v in value.split(' και ')] 143 | 144 | table_dict[column_names[i]][row_values[0].lower()] = value 145 | except: 146 | print(name, column_names[i], row_values[0], 'KeyError') 147 | 148 | else: 149 | print(name, ': mismatch len(row_values) == len(column_names)') 150 | 151 | else: 152 | table_dict = { 153 | 'ενικός': 154 | {'ονομαστική': '', 'γενική': '', 'αιτιατική': '', 'κλητική': ''}, 155 | 'πληθυντικός': 156 | {'ονομαστική': '', 'γενική': '', 'αιτιατική': '', 'κλητική': ''}} 157 | 158 | dict_500[name] = table_dict 159 | 160 | 161 | if c%500==0: 162 | print(str(c)+' entries checkpoint...') 163 | c = dict_to_file(dict_500, c) 164 | dict_500 = {} 165 | 166 | if next_page != None: 167 | 168 | return(crawl_names(dict_500, domain_URL, next_page['href'], c, listing_category)) 169 | 170 | else: 171 | #write last dict_500 172 | c = dict_to_file(dict_500, c) 173 | 174 | return 175 | 176 | 177 | start_time = time.time() 178 | domain_URL = 'https://el.wiktionary.org' 179 | 180 | dirs_urls_dict = {'../out_files/wiki_data/male_name_cases': ['/wiki/Κατηγορία:Ανδρικά_ονόματα_(νέα_ελληνικά)', 'ανδρικό όνομα'], 181 | '../out_files/wiki_data/female_name_cases': ['/wiki/Κατηγορία:Γυναικεία_ονόματα_(νέα_ελληνικά)', 'γυναικείο όνομα'], 182 | '../out_files/wiki_data/male_surname_cases': ['/wiki/Κατηγορία:Ανδρικά_επώνυμα_(νέα_ελληνικά)', 'ανδρικό επώνυμο'], 183 | 
'../out_files/wiki_data/female_surname_cases': ['/wiki/Κατηγορία:Γυναικεία_επώνυμα_(νέα_ελληνικά)', 'γυναικείο επώνυμο'] 184 | } 185 | 186 | for dirpath, list in dirs_urls_dict.items(): 187 | dir_name = os.path.basename(dirpath) 188 | page_url = list[0] 189 | listing_category = list[1] 190 | c=0 191 | dict_500 = {} 192 | crawl_names(dict_500, domain_URL, page_url, c, listing_category) 193 | 194 | # sort by creation time 195 | file_paths = sorted(Path(dirpath).iterdir(), key=os.path.getctime) 196 | 197 | ultimate_dict = concat_json_files(file_paths) 198 | print('Collected '+ str(len(ultimate_dict.keys())) + ' entries in this category.') 199 | with codecs.open(os.path.join('../out_files/wiki_data/', dir_name+'.json'), 'w', encoding='utf-8') as f: 200 | json.dump(ultimate_dict, f, ensure_ascii=False, indent=4) 201 | 202 | diff = ((time.time() - start_time)/60) 203 | print("Lasted --- %s minutes ---" % str(diff)) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Greek Parliament Proceedings, 1989-2020 (Dataset) 2 | This [dataset](https://zenodo.org/record/4311577#.X8-yMdgzaUk) is produced on behalf of [iMEdD](https://www.imedd.org/) by PhD Student in Machine Learning [Konstantina Dritsa](https://github.com/Dritsa-Konstantina), with the contribution of data journalist and [iMEdD Lab](https://lab.imedd.org/) Project Manager [Kelly Kiki](https://github.com/kellykiki). iMEdD (incubator for Media Education and Development) is a non-profit journalism organisation that supports and promotes transparency, credibility and independence in journalism. Lab is iMEdD’s content production division which publishes original interactive investigative and data-driven stories by experimenting with new forms and tools in journalism. 
3 | 4 | This dataset is the next version of a [previous upload](https://zenodo.org/record/2587904#.X8-jl9gzaUk), which originated from the work implemented during the course of the Master's thesis entitled "[Speech quality and sentiment analysis on the Hellenic Parliament proceedings](http://www.pyxida.aueb.gr/index.php?op=view_object&object_id=6387)" at the Athens University of Economics & Business in 2018 under the supervision of the Associate Professor Panagiotis Louridas. 5 | 6 | This dataset includes 1,280,918 speeches (rows) of Greek parliament members with a total volume of 2.30 GB, which were exported from 5,355 parliamentary sitting record files. They extend chronologically from early July 1989 up to late July 2020. The dataset consists of a .csv file in UTF-8 encoding and includes the following columns of data: 7 | - member_name: the official name of the parliament member who talked during a sitting. 8 | - sitting_date: the date that the sitting took place. 9 | - parliamentary_period: the name and/or number of the parliamentary period that the speech took place in. A parliamentary period includes multiple parliamentary sessions. 10 | - parliamentary_session: the name and/or number of the parliamentary session that the speech took place in. A parliamentary session includes multiple parliamentary sittings. 11 | - parliamentary_sitting: the name and/or number of the parliamentary sitting that the speech took place in. 12 | - political_party: the political party that the speaker belonged to at the moment of their speech. 13 | - government: the government in force when the speech took place. 14 | - member_region: the electoral district the speaker belonged to. 15 | - roles: information about the parliamentary roles and/or government position of the speaker at the moment of their speech. 
16 | - member_gender: the sex of the speaker 17 | - speech: the speech that the member made during the parliamentary sitting 18 | 19 | The methodology followed for the production of this dataset is described in the iMEdD Lab's article entitled "[The creation of a dataset with the parliament proceedings within 31 years](https://devlab.imedd.org/i-dimiourgia-tou-dataset-me-ta-koinovouleftika-praktika/)". 20 | 21 | Scripts and relevant documentation are available here. 22 | 23 | #### Script order: 24 | 25 | #### Record Collection and Cleaning: 26 | 27 | 1. __web_crawler_for_proceeding_files.py:__ Download record files from https://www.hellenicparliament.gr/Praktika/Synedriaseis-Olomeleias to the original_data folder and change the filenames to match the template "recordDate_id_periodNo_sessionNo_sittingNo.ext". This script also logs the records that are missing from the website in the file rows_with_no_files.txt. 28 | 1. __convert2txt.py:__ Convert all types of downloaded record files (pdf, doc, docx) to text format with the use of tika-app-1.20.jar and translate the filenames from Greek to English. Save the converted files to the folder _data. 29 | 30 | #### Parliament Members Data Collection and Cleaning: 31 | 3. __web_crawler_for_parliament_members.py:__ Download information of the Parliament Members from the dropdown list at https://www.hellenicparliament.gr/Vouleftes/Diatelesantes-Vouleftes-Apo-Ti-Metapolitefsi-Os-Simera/. Write the output to the file original_parl_members_data.csv. 32 | 1. __parl_members_data_cleaner.py:__ Clean and format the file original_parl_members_data.csv for further use. Write the output to the file parl_members_activity_1989onwards.csv. 33 | 1. 
__add_gender_to_members.py:__ Add gender column to the parl_members_activity_1989onwards.csv with the use of the manually created files female_names_alternatives_gr.txt and male_names_alternatives_gr.txt (lists of first names and their alternatives) and create the file parl_members_activity_1989onwards_with_gender.csv. 34 | 35 | #### Government Members Data Collection and Cleaning: 36 | 37 | 7. __greek_name_cases_wiki_crawler.py:__ Crawl the wiktionary lists of modern Greek female/male names/surnames and additionally collect all the grammatical cases, when available in tables within each entry page. The output consists of 4 json files, namely female_name_cases.json, male_name_cases.json, female_surname_cases.json, male_surname_cases.json. 38 | 1. __produce_cases_from_nominative.py:__ Take as input the json files produced from the script greek_name_cases_wiki_crawler.py and produce the missing grammatical cases based on the nominative case. It produces 4 json files, namely female_name_cases_populated.json, male_name_cases_populated.json, female_surname_cases_populated.json, male_surname_cases_populated.json. 39 | 1. __web_crawler_for_government_members.py:__ Crawl the website https://gslegal.gov.gr/?page_id=776&sort=time and collect information about all the governments from 1989 up to 2020 and all the members that were assigned government roles. Create the files governments_1989onwards.csv (with information only about the governments and their start and end dates) and the original_gov_members_data.csv with the crawled raw data from the website. 40 | 1. __gov_members_data_cleaner.py:__ Clean the file original_gov_members_data.csv. Convert names and surnames from genitive to nominative case and add gender with the use of the files crawled from Wiktionary (male_name_cases_populated.json, female_name_cases_populated.json, male_surname_cases_populated.json). 
Make corrections in roles, member names, entries and create a file formatted_roles_gov_members_data.csv with columns: member_name, role, role_start_date, role_end_date, gender. 41 | 42 | #### Speech Extraction 43 | 44 | 11. __join_members_activity.py:__ Concatenate 3 files with information about the parliament members and extra parliamentary members. The input files are: parl_members_activity_1989onwards_with_gender.csv (includes elected parliament members), formatted_roles_gov_members_data.csv (includes all government members that have been assigned a government role but may not have been necessarily elected as parliament members) and extra_roles_manually_collected.csv (includes manually collected additional roles from Wikipedia such as Chairman of the Parliament, party leaders etc). An extra column is added with the name of the government during each member's activity, with the use of the file governments_1989onwards.csv. The final output of this script is the file all_members_activity.csv with columns: member_name, member_start_date, member_end_date, political_party, administrative_region, gender, roles, government_name. 45 | 1. __member_speech_matcher.py:__ Extract speeches from record files and match them to the official parliament or government member. After the detection of a speech in a record file with the use of regular expressions, we search the detected speaker in the file all_members_activity.csv. For the string comparison between names, the manually created file greek_names_alts_only.txt is used with a list of Greek names and their alternatives. The output file is the tell_all.csv with columns: member_name, sitting_date, parliamentary_period, parliamentary_session, parliamentary_sitting, political_party, government, member_region,roles, member_gender,speaker_info, speech. This script also creates the file files_with_content_problems.txt, where record files that are skipped due to encoding issues are logged. 46 | 1. 
# Colloquial first names -> formal variants, applied to a *copy* of the
# parliament member names that is only used for matching.
_PARL_NAME_FIXES = {
    r'\bαντωνης\b': 'αντωνιος',
    r'\bαχιλλευς\b': 'αχιλλεας',
    r'\bγιωργος\b': 'γεωργιος',
    r'\bγιαννης\b': 'ιωαννης',
}

# Government member names rewritten in place before matching.
_GOV_NAME_FIXES = {
    r'\bβυρωνας\b': 'βυρων',
    r'\bεμμανουηλ λουκακης\b': 'μανωλης λουκακης',
    r'\bκιμωνας κουλουρης\b': 'κιμων κουλουρης',
    r'\bκωνσταντινος σημιτης\b': 'κωστας σημιτης',
    r'\bμιχαηλ παπαδοπουλος\b': 'μιχαλης παπαδοπουλος',
    r'\bμιχαηλ παπακωνσταντινου\b': 'μιχαλης παπακωνσταντινου',
    r'\bμιχαηλ-γεωργιος λιαπης\b': 'μιχαλης γεωργιος λιαπης',
    r'\bνικολαος χριστοδουλακης\b': 'νικος χριστοδουλακης'
}


def remove_father_name(name):
    """Drop the second space-separated token (the father's name) of ``name``.

    Names with fewer than three tokens are returned unchanged: the original
    code turned 'surname firstname' into 'surname ' (first name lost,
    trailing space kept), which made such a name match every member sharing
    the surname.
    NOTE(review): assumes two-token names carry no father's name - confirm
    against the input files.
    """
    name_parts = name.split(' ')
    if len(name_parts) < 3:
        return name
    return name_parts[0] + ' ' + ' '.join(name_parts[2:])


def parl_name_formatting(df):
    """Add a 'member_name_copy' column with normalised first names.

    The replacement keys are regular expressions; the ``\\b`` anchors keep
    them from rewriting substrings of longer words. 'member_name' itself is
    left untouched. ``df`` is modified in place and returned.
    """
    df['member_name_copy'] = df['member_name'].replace(_PARL_NAME_FIXES,
                                                       regex=True)

    return df


def gov_name_formatting(df):
    """Rewrite known government member name variants in 'member_name'.

    ``df`` is modified in place and returned.
    """
    df['member_name'] = df['member_name'].replace(_GOV_NAME_FIXES, regex=True)

    return df


def assert_filled_gender(df):
    """Print a warning listing the member names whose 'gender' is missing."""
    missing = df['gender'].isnull()
    if missing.any():
        print('Warning: some gender values ar NaN for the following member names...')
        print(df['member_name'][missing])
    else:
        print('All names have assigned gender.')

    return


def add_government_column(df, df_governments):
    """Attach to every member row the governments overlapping its activity.

    Adds a 'government_name' column of lists holding strings of the form
    ``gov_name(dd/mm/YYYY-dd/mm/YYYY)``. The date columns of both frames are
    converted to datetimes in place.

    Args:
        df: member rows with 'member_start_date' and 'member_end_date'.
        df_governments: rows with 'gov_name', 'date_from' and 'date_to'.

    Returns:
        The rows of ``df`` whose activity ends on or after the start of the
        earliest government, with 'government_name' filled in.
    """
    # Convert to datetime type for best date comparisons
    df['government_name'] = [[] for _ in range(df.shape[0])]
    df['member_start_date'] = pd.to_datetime(df['member_start_date'])
    df['member_end_date'] = pd.to_datetime(df['member_end_date'])
    df_governments['date_from'] = pd.to_datetime(df_governments['date_from'])
    df_governments['date_to'] = pd.to_datetime(df_governments['date_to'])
    df_governments = df_governments.sort_values(by='date_from', ascending=True)

    # Drop rows before first government. Positional access is required here:
    # after sorting, label 0 is not necessarily the earliest government.
    first_government_start = df_governments['date_from'].iloc[0]
    df = df.loc[df['member_end_date'] >= first_government_start]

    for index1, row1 in df.iterrows():
        matched_to_government = False
        for index2, row2 in df_governments.iterrows():
            try:
                # NOTE(review): this condition was partly destroyed in the
                # copy of the file under review; the strict '<' against
                # date_to is a reconstruction - confirm against upstream.
                # start inside the government, or end inside it, or the
                # activity spans the whole government.
                if (row1.member_start_date >= row2.date_from and
                    row1.member_start_date < row2.date_to) or (
                    row1.member_end_date >= row2.date_from and
                    row1.member_end_date < row2.date_to) or (
                    row1.member_start_date <= row2.date_from and
                    row1.member_end_date >= row2.date_to):

                    item = row2['gov_name']+'('+str(row2.date_from.strftime('%d/%m/%Y'))+'-'+\
                        str(row2.date_to.strftime('%d/%m/%Y'))+')'
                    df.at[index1, 'government_name'].append(item)
                    matched_to_government = True
            except (TypeError, ValueError):
                print('PROBLEM: cannot compare government dates and member dates')
                print(row1.member_start_date, type(row1.member_start_date))
                print(row2.date_from, type(row2.date_from))

        if not matched_to_government:
            print('PROBLEM: not matched to existing government')
            print(row1)

    return df


def _name_tokens(name):
    """Split a name into word tokens, treating '-', '(' and ')' as spaces."""
    name = re.sub(r'[-()]', ' ', name)
    name = re.sub(r'\s\s+', ' ', name)
    return [part for part in name.split(' ') if part != '']


def _format_role(row):
    """Render a role row as ``role(dd/mm/YYYY-dd/mm/YYYY)``."""
    return row['role']+'('+str(row['role_start_date'].strftime('%d/%m/%Y')
                               )+'-'+str(row['role_end_date'].strftime('%d/%m/%Y'))+')'


def main():
    """Join parliament members, government members and extra roles.

    Reads the three member files and the governments file from
    ``../out_files`` and writes ``../out_files/all_members_activity.csv``.
    """
    # FILE 1: elected parliament members from hellenicparliament.gr
    df_parl = pd.read_csv('../out_files/parl_members_activity_1989onwards_with_gender.csv', encoding='utf-8')
    df_parl = parl_name_formatting(df_parl)  # add name copy column and proceed to adjustments
    # Remove father name because not all input files have it
    df_parl['member_name_copy'] = df_parl['member_name_copy'].apply(remove_father_name)
    df_parl['roles'] = [[] for _ in range(df_parl.shape[0])]

    # FILE 2: assigned government members with roles from gslegal.gov.gr
    df_gov = pd.read_csv('../out_files/formatted_roles_gov_members_data.csv', encoding='utf-8')
    df_gov = gov_name_formatting(df_gov)

    # FILE 3: manually extracted additional parliament roles
    df_extra = pd.read_csv('../out_files/extra_roles_manually_collected.csv', encoding='utf-8')
    # Remove father name because not all input files have it
    df_extra['member_name'] = df_extra['member_name'].apply(remove_father_name)

    # Concatenate members with roles
    df_roles = pd.concat([df_gov, df_extra])

    # Match members from df_roles to df_parl BY NAME AND DATES and find extra parliamentary members
    df_roles['role_start_date'] = pd.to_datetime(df_roles['role_start_date']).dt.date
    df_roles['role_end_date'] = pd.to_datetime(df_roles['role_end_date']).dt.date
    df_parl['member_start_date'] = pd.to_datetime(df_parl['member_start_date']).dt.date
    df_parl['member_end_date'] = pd.to_datetime(df_parl['member_end_date']).dt.date

    # Tokenise every parliament name once instead of once per role row.
    parl_entries = [
        (index_parl,
         set(_name_tokens(row_parl['member_name_copy'])),
         row_parl['member_start_date'],
         row_parl['member_end_date'])
        for index_parl, row_parl in df_parl.iterrows()
    ]

    extra_parliamentary = []

    # Match the member names from df_roles to df_parl, in order to transfer their roles
    for c, (index_gov, row_gov) in enumerate(df_roles.iterrows(), start=1):

        if c % 100 == 0:
            print(c)

        gov_matched_to_parl = False
        gov_tokens = set(_name_tokens(row_gov['member_name']))
        r_s = row_gov['role_start_date']
        r_e = row_gov['role_end_date']

        # if we find a match, we don't break. continue iteration because it might
        # match to more than one periods as formed in the gov files
        for index_parl, parl_tokens, m_s, m_e in parl_entries:

            # full names match, regardless of word order, when every token
            # of the role holder's name occurs in the parliament name
            if not gov_tokens <= parl_tokens:
                continue

            # if role start in member activity, or role end in member activity
            # or activity in role of large range
            if (r_s>=m_s and r_s<=m_e) or (r_e>=m_s and r_e<=m_e) or (r_s<=m_s and r_e>=m_e):
                gov_matched_to_parl = True
                df_parl.at[index_parl, 'roles'].append(_format_role(row_gov))

        # if df_roles name not in df_parl, it refers to extra parliamentary member
        if not gov_matched_to_parl:
            extra_parliamentary.append([row_gov['member_name'], row_gov['role_start_date'],
                                        row_gov['role_end_date'], 'εξωκοινοβουλευτικός',
                                        np.nan, row_gov['gender'], [_format_role(row_gov)]])

    del df_parl['member_name_copy']

    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    df_parl = pd.concat([df_parl,
                         pd.DataFrame(data=extra_parliamentary,
                                      columns=df_parl.columns)],
                        ignore_index=True)

    df_governments = pd.read_csv('../out_files/governments_1989onwards.csv', encoding='utf-8')
    df_parl = add_government_column(df_parl, df_governments)

    assert_filled_gender(df_parl)

    df_parl.to_csv('../out_files/all_members_activity.csv', encoding='utf-8', index=False)
    print('Created file all_members_activity.csv with columns ', df_parl.columns)


if __name__ == '__main__':
    main()
1997-12-21_1156_period-9-presided-parliamentary-republic_session-2_sitting-56.txt 6 | 1998-01-07_1155_period-9-presided-parliamentary-republic_session-2_sitting-57.txt 7 | 1998-01-08_1154_period-9-presided-parliamentary-republic_session-2_sitting-58.txt 8 | 1998-01-12_1172_period-9-presided-parliamentary-republic_session-2_sitting-60.txt 9 | 1998-01-13_1171_period-9-presided-parliamentary-republic_session-2_sitting-61.txt 10 | 1998-01-14_1170_period-9-presided-parliamentary-republic_session-2_sitting-62.txt 11 | 1998-01-15_1169_period-9-presided-parliamentary-republic_session-2_sitting-63.txt 12 | 1998-01-16_1168_period-9-presided-parliamentary-republic_session-2_sitting-64.txt 13 | 1998-01-19_1167_period-9-presided-parliamentary-republic_session-2_sitting-65.txt 14 | 1998-01-21_1165_period-9-presided-parliamentary-republic_session-2_sitting-67.txt 15 | 1998-01-22_1164_period-9-presided-parliamentary-republic_session-2_sitting-68.txt 16 | 1998-01-23_1183_period-9-presided-parliamentary-republic_session-2_sitting-69.txt 17 | 1998-01-26_1182_period-9-presided-parliamentary-republic_session-2_sitting-70.txt 18 | 1998-01-28_1180_period-9-presided-parliamentary-republic_session-2_sitting-72.txt 19 | 1998-01-29_1179_period-9-presided-parliamentary-republic_session-2_sitting-73.txt 20 | 1998-01-30_1178_period-9-presided-parliamentary-republic_session-2_sitting-74.txt 21 | 1998-02-02_1177_period-9-presided-parliamentary-republic_session-2_sitting-75.txt 22 | 1998-02-03_1176_period-9-presided-parliamentary-republic_session-2_sitting-76.txt 23 | 1998-02-04_1175_period-9-presided-parliamentary-republic_session-2_sitting-77.txt 24 | 1998-02-06_1193_period-9-presided-parliamentary-republic_session-2_sitting-79.txt 25 | 1998-02-09_1192_period-9-presided-parliamentary-republic_session-2_sitting-80.txt 26 | 1998-02-10_1191_period-9-presided-parliamentary-republic_session-2_sitting-81.txt 27 | 1998-02-11_1190_period-9-presided-parliamentary-republic_session-2_sitting-82.txt 28 | 
1998-02-12_1189_period-9-presided-parliamentary-republic_session-2_sitting-83.txt 29 | 1998-02-16_1187_period-9-presided-parliamentary-republic_session-2_sitting-85.txt 30 | 1998-02-17_1186_period-9-presided-parliamentary-republic_session-2_sitting-86.txt 31 | 1998-02-19_1184_period-9-presided-parliamentary-republic_session-2_sitting-88.txt 32 | 1998-02-20_1203_period-9-presided-parliamentary-republic_session-2_sitting-89.txt 33 | 1998-02-23_1202_period-9-presided-parliamentary-republic_session-2_sitting-90.txt 34 | 1998-02-24_1201_period-9-presided-parliamentary-republic_session-2_sitting-91.txt 35 | 1998-02-25_1200_period-9-presided-parliamentary-republic_session-2_sitting-92.txt 36 | 1998-02-26_1199_period-9-presided-parliamentary-republic_session-2_sitting-93.txt 37 | 1998-03-04_1198_period-9-presided-parliamentary-republic_session-2_sitting-94.txt 38 | 1998-03-05_1197_period-9-presided-parliamentary-republic_session-2_sitting-95.txt 39 | 1998-03-06_1196_period-9-presided-parliamentary-republic_session-2_sitting-96.txt 40 | 1998-03-09_1195_period-9-presided-parliamentary-republic_session-2_sitting-97.txt 41 | 1998-03-10_1194_period-9-presided-parliamentary-republic_session-2_sitting-98.txt 42 | 1998-03-11_1213_period-9-presided-parliamentary-republic_session-2_sitting-99.txt 43 | 1998-03-12_1212_period-9-presided-parliamentary-republic_session-2_sitting-100.txt 44 | 1998-03-13_1211_period-9-presided-parliamentary-republic_session-2_sitting-101.txt 45 | 1998-03-16_1210_period-9-presided-parliamentary-republic_session-2_sitting-102.txt 46 | 1998-03-17_1209_period-9-presided-parliamentary-republic_session-2_sitting-103.txt 47 | 1998-03-18_1208_period-9-presided-parliamentary-republic_session-2_sitting-104.txt 48 | 1998-03-19_1207_period-9-presided-parliamentary-republic_session-2_sitting-105.txt 49 | 1998-03-20_1206_period-9-presided-parliamentary-republic_session-2_sitting-106.txt 50 | 
1998-03-31_1204_period-9-presided-parliamentary-republic_session-2_sitting-108.txt 51 | 1998-07-14_1260_period-9-presided-parliamentary-republic_(summer-'98)_sitting-1.txt 52 | 1998-07-15_1259_period-9-presided-parliamentary-republic_(summer-'98)_sitting-2.txt 53 | 1998-07-16_1258_period-9-presided-parliamentary-republic_(summer-'98)_sitting-3.txt 54 | 1998-07-17_1257_period-9-presided-parliamentary-republic_(summer-'98)_sitting-4.txt 55 | 1998-07-21_1256_period-9-presided-parliamentary-republic_(summer-'98)_sitting-5.txt 56 | 1998-07-22_1255_period-9-presided-parliamentary-republic_(summer-'98)_sitting-6.txt 57 | 1998-07-23_1254_period-9-presided-parliamentary-republic_(summer-'98)_sitting-7.txt 58 | 1998-07-28_1273_period-9-presided-parliamentary-republic_(summer-'98)_sitting-8.txt 59 | 1998-07-29_1272_period-9-presided-parliamentary-republic_(summer-'98)_sitting-9.txt 60 | 1998-07-30_1271_period-9-presided-parliamentary-republic_(summer-'98)_sitting-10.txt 61 | 1998-08-04_1270_period-9-presided-parliamentary-republic_(summer-'98)_sitting-11.txt 62 | 1998-08-05_1268_period-9-presided-parliamentary-republic_(summer-'98)_sitting-12.txt 63 | 1998-08-05_1269_period-9-presided-parliamentary-republic_(summer-'98)_sitting-13.txt 64 | 1998-08-07_1267_period-9-presided-parliamentary-republic_(summer-'98)_sitting-14.txt 65 | 1998-08-11_1265_period-9-presided-parliamentary-republic_(summer-'98)_sitting-15.txt 66 | 1998-08-12_1264_period-9-presided-parliamentary-republic_(summer-'98)_sitting-16.txt 67 | 1998-08-13_1283_period-9-presided-parliamentary-republic_(summer-'98)_sitting-17.txt 68 | 1998-08-18_1282_period-9-presided-parliamentary-republic_(summer-'98)_sitting-18.txt 69 | 1998-08-19_1281_period-9-presided-parliamentary-republic_(summer-'98)_sitting-19.txt 70 | 1998-08-20_1280_period-9-presided-parliamentary-republic_(summer-'98)_sitting-20.txt 71 | 1998-08-25_1279_period-9-presided-parliamentary-republic_(summer-'98)_sitting-21.txt 72 | 
1998-08-26_1278_period-9-presided-parliamentary-republic_(summer-'98)_sitting-22.txt 73 | 1998-08-27_1277_period-9-presided-parliamentary-republic_(summer-'98)_sitting-23.txt 74 | 1998-09-01_1276_period-9-presided-parliamentary-republic_(summer-'98)_sitting-24.txt 75 | 1998-09-02_1275_period-9-presided-parliamentary-republic_(summer-'98)_sitting-25.txt 76 | 1998-09-03_1274_period-9-presided-parliamentary-republic_(summer-'98)_sitting-26.txt 77 | 1998-09-08_1293_period-9-presided-parliamentary-republic_(summer-'98)_sitting-27.txt 78 | 1998-09-09_1292_period-9-presided-parliamentary-republic_(summer-'98)_sitting-28.txt 79 | 1998-09-10_1291_period-9-presided-parliamentary-republic_(summer-'98)_sitting-29.txt 80 | 1998-09-16_1289_period-9-presided-parliamentary-republic_(summer-'98)_sitting-31.txt 81 | 1998-09-17_1288_period-9-presided-parliamentary-republic_(summer-'98)_sitting-32.txt 82 | 1998-09-22_1287_period-9-presided-parliamentary-republic_(summer-'98)_sitting-33.txt 83 | 1998-09-24_1285_period-9-presided-parliamentary-republic_(summer-'98)_sitting-35.txt 84 | 1998-09-30_1303_period-9-presided-parliamentary-republic_(summer-'98)_sitting-37.txt 85 | 2000-07-27_1619_period-10-presided-parliamentary-republic_(summer-'00)_sitting-22.txt 86 | 2000-11-07_1672_period-10-presided-parliamentary-republic_session-1-(continuation-of-plenary-session)_sitting-48.txt 87 | 2000-11-22_1681_period-10-presided-parliamentary-republic_session-1-(continuation-of-plenary-session)_sitting-59.txt 88 | 2000-12-22_1699_period-10-presided-parliamentary-republic_session-1-(continuation-of-plenary-session)_sitting-81.txt 89 | 2001-08-21_1829_period-10-presided-parliamentary-republic_(summer-'01)_sitting-19.txt 90 | 2002-02-12_1943_period-10-presided-parliamentary-republic_session-2_sitting-87.txt 91 | 2002-02-14_1942_period-10-presided-parliamentary-republic_session-2_sitting-88.txt 92 | 2002-03-01_1935_period-10-presided-parliamentary-republic_session-2_sitting-98.txt 93 | 
2002-04-03_1959_period-10-presided-parliamentary-republic_session-2_sitting-114.txt 94 | 2002-05-30_1990_period-10-presided-parliamentary-republic_session-2_sitting-142.txt 95 | 2002-06-03_1988_period-10-presided-parliamentary-republic_session-2_sitting-144.txt 96 | 2002-06-04_1987_period-10-presided-parliamentary-republic_session-2_sitting-145.txt 97 | 2002-06-05_1986_period-10-presided-parliamentary-republic_session-2_sitting-146.txt 98 | 2002-06-06_1985_period-10-presided-parliamentary-republic_session-2_sitting-147.txt 99 | 2002-06-10_2003_period-10-presided-parliamentary-republic_session-2_sitting-149.txt 100 | 2002-06-11_2002_period-10-presided-parliamentary-republic_session-2_sitting-150.txt 101 | 2002-06-12_2001_period-10-presided-parliamentary-republic_session-2_sitting-151.txt 102 | 2002-06-13_2000_period-10-presided-parliamentary-republic_session-2_sitting-152.txt 103 | 2002-06-17_1998_period-10-presided-parliamentary-republic_session-2_sitting-154.txt 104 | 2002-06-18_1997_period-10-presided-parliamentary-republic_session-2_sitting-155.txt 105 | 2002-06-19_1995_period-10-presided-parliamentary-republic_session-2_sitting-156.txt 106 | 2002-07-09_2009_period-10-presided-parliamentary-republic_(summer-'02)_sitting-5.txt 107 | 2002-07-10_2008_period-10-presided-parliamentary-republic_(summer-'02)_sitting-6.txt 108 | 2002-07-16_2006_period-10-presided-parliamentary-republic_(summer-'02)_sitting-8.txt 109 | 2002-07-18_2004_period-10-presided-parliamentary-republic_(summer-'02)_sitting-10.txt 110 | 2002-08-27_2030_period-10-presided-parliamentary-republic_(summer-'02)_sitting-23.txt 111 | 2002-09-11_2041_period-10-presided-parliamentary-republic_(summer-'02)_sitting-32.txt 112 | 2003-03-06_2142_period-10-presided-parliamentary-republic_session-3_sitting-84.txt 113 | 2003-05-06_2161_period-10-presided-parliamentary-republic_session-3_sitting-105.txt 114 | 2003-05-30_2167_period-10-presided-parliamentary-republic_session-3_sitting-119.txt 115 | 
def toDatetime(date):
    """Convert a date cell of the governments table into a ``datetime``.

    Parameters:
        date: either the literal 'ΣΗΜΕΡΑ' ("today") or a day/month/year
            string whose parts are separated by '.', '/' or '-'
            (e.g. '10.03.2004', '19/9/2007').

    Returns:
        A ``datetime.datetime`` at midnight of the requested day.

    Raises:
        ValueError: if the string does not split into exactly three parts
            or the parts do not form a valid '%Y-%m-%d' date.
    """
    if date == 'ΣΗΜΕΡΑ':
        # Build a datetime (not a bare date) so both branches return the same
        # type; a date and a datetime cannot be ordered against each other.
        return dt.combine(datetime.date.today(), datetime.time.min)

    day, month, year = re.split(r'[\.\/-]', date)
    return dt.strptime(year + '-' + month + '-' + day, '%Y-%m-%d')
def remove_notes(content):
    """Return the items of *content* that are not editorial notes.

    An item counts as a note when it contains 'ο πρωθυπουργος' (with any
    amount of whitespace after the article), 'διορισθηκε' or 'διορισθηκαν'.
    The order of the remaining items is preserved.
    """
    note_pattern = re.compile(r'(ο\s*πρωθυπουργος)|(διορισθηκε)|(διορισθηκαν)')
    return [entry for entry in content if note_pattern.search(entry) is None]
def balanced_parenthesis(myStr):
    """Report whether *myStr* contains as many '(' as ')' characters.

    Only the two counts are compared; nesting order is not checked, so a
    string such as ')(' is reported as balanced.

    Parameters:
        myStr: the text to inspect.

    Returns:
        True when the number of opening and closing parentheses is equal,
        False otherwise.
    """
    # str.count replaces the hand-built character lists and avoids naming a
    # local variable `open`, which shadowed the builtin.
    return myStr.count('(') == myStr.count(')')
# e.g. (...) or (π.δ...) or π.δ...(φεκ..) or φεκ..
def remove_presidential_decrees(content):
    """Drop decree/gazette references from the scraped text items.

    Items that are fully parenthesised, that equal ':π.δ', or that mention
    'π.δ' / 'φεκ' without containing a colon are removed. Items containing a
    colon are kept, with a trailing '(π.δ ...)' group (optionally followed by
    ':' or '+') cut off.
    """
    trailing_decree = re.compile(r'\(π\.δ(.*?)\)\s*(:|\+)?$')
    opens_with_paren = re.compile(r'^\s*\(')
    closes_with_paren = re.compile(r'\).?\s*$')

    def is_reference_only(text):
        # Whole item wrapped in parentheses
        if opens_with_paren.search(text) and closes_with_paren.search(text):
            return True
        # Specific case
        if text == ':π.δ':
            return True
        # Decree / gazette mention that is not part of an event line
        return ':' not in text and ('π.δ' in text or 'φεκ' in text)

    kept = []
    for entry in content:
        if is_reference_only(entry):
            continue
        if ':' in entry:
            # sub() returns the entry unchanged when no trailing decree exists
            entry = trailing_decree.sub('', entry)
        kept.append(entry)

    return kept
def correct_interwined_entries(members, roles):
    """Split two known entries whose names and roles were glued together.

    Returns a 4-tuple ``(member1, member2, role1, role2)`` when the
    ``members`` / ``roles`` pair is one of the two known glued entries, and
    four ``None`` values for any other input.
    """
    # (glued members, glued roles, (m1, m2, r1, r2))
    known_fixes = (
        ('γεωργιου ζανιαγεωργιου βερνικου',
         'υπουργου οικονομικωνυφυπουργου ναυτιλιας',
         ('γεωργιου ζανια', 'γεωργιου βερνικου',
          'υπουργου οικονομικων', 'υφυπουργου ναυτιλιας')),
        ('συμεων κεδικογλου (του βασιλειου)κωνσταντινου τσιαρα',
         'υφυπουργου στον πρωθυπουργουφυπουργου εξωτερικων',
         ('συμεων κεδικογλου(του βασιλειου)', 'κωνσταντινου τσιαρα',
          'υφυπουργου στον πρωθυπουργο', 'υφυπουργου εξωτερικων')),
    )

    separated = (None, None, None, None)
    for glued_members, glued_roles, fixed in known_fixes:
        if members == glued_members and roles == glued_roles:
            separated = fixed

    return separated
# Module-level patterns; correct_separation() reads endswith_digits_regex and
# endswith_month_regex as globals.
endswith_digits_regex = re.compile(r'\d+$')
has_digit_regex = re.compile(r'\d+')

# For a specific data correction: a date such as "27 αυγουστου 2015" (caretaker
# government of βασιλικης σπ. θανου-χριστοφιλου) arrives split after the month.
# The month spellings must match the keys used by month_to_number(); November
# was previously misspelled 'νομεβριου', so split November dates never matched.
endswith_month_regex = re.compile(r'\s(ιανουαριου|φεβρουαριου|μαρτιου|απριλιου|μαιου|ιουνιου|ιουλιου|αυγουστου|'
                                  r'σεπτεμβριου|οκτωβριου|νοεμβριου|δεκεμβριου)$')

# NOTE(review): the next four names are not referenced again in this script.
activity = defaultdict(list)
all_cases = []

months = []
types = []

# One row per (event, member, role) across all governments
rows_list = []

for index, row in df_1989_onwards.iterrows():
    print('Collecting data from government \"', row.gov_name, '\"')

    # Fetch the print view of the government page
    html = requests.get(row.gov_url+'&print=1').text
    html = html.replace(u'\xa0', ' ')
    # NOTE(review): as visible here this replaces a space with a space; the
    # first argument may originally have been an HTML entity or a non-breaking
    # space character - confirm against the repository.
    html = html.replace(u' ', ' ')
    soup = BeautifulSoup(html, "html.parser")

    soup = clean_up_soup(soup)

    # Start formatting content
    content = soup_to_list(soup)

    content = [text_formatting(t) for t in content]
    content = correct_separation(content)

    # Specific correction for wrong data: keep only what precedes 'αναπληρωσεις:'
    if row.gov_name == 'παπανδρεου ανδρεα':
        anaplirosi_index = content.index('αναπληρωσεις:')
        content = content[:anaplirosi_index]

    # Remove not needed information
    content = remove_presidential_decrees(content)
    content = remove_gov_info(content)
    content = remove_notes(content)
    content = [text_formatting(c) for c in content]

    # Create dict {event:[ [..,..], [..,..], ...]}
    events_per_gov = defaultdict(list)
    added_indexes = []  # positions already consumed as the role of a pair
    last_event = ''
    for index, item in enumerate(content):
        if index not in added_indexes:
            if any(string in item for string
                   in [':','διορισμος', 'παραιτηση', 'παυση', 'απεβιωσε', 'αναπληρωση', 'αναπληρωσεις']):
                last_event = item
            else:
                # A non-event item is a member name followed by its role
                if last_event != '' and index != len(content)-1:
                    name = content[index]
                    role = content[index+1]
                    # specific correction for george papandreou government
                    if role == 'θαλασσιων υποθεσεων, νησων και αλιειας':
                        role = 'υφυπουργου θαλασσιων υποθεσεων, νησων και αλιειας'

                    added_indexes.append(index+1)
                    item = [name, role]
                    events_per_gov[last_event].append(item)

    events_per_gov.pop('', None)

    # Create initial dataframe
    for event, names_roles in events_per_gov.items():
        date, event_type = get_date_and_type(event)
        for name_role_pair in names_roles:
            member_name = name_role_pair[0]
            role = name_role_pair[1]

            if member_name in \
                    ['γεωργιου ζανιαγεωργιου βερνικου', 'συμεων κεδικογλου (του βασιλειου)κωνσταντινου τσιαρα']:

                # specific correction for mistakes in the data in samara's government
                m1,m2,r1,r2 = correct_interwined_entries(member_name, role)
                r1 = role_formatting(r1)
                r2 = role_formatting(r2)
                rows_list.append([date, event_type, m1, r1, row.date_from,row.date_to, row.gov_name])
                rows_list.append([date, event_type, m2, r2, row.date_from,row.date_to, row.gov_name])

            else:
                # Pairs containing digits are skipped
                if not has_digit_regex.search(member_name+role):
                    role = role_formatting(role)
                    rows_list.append([date,event_type, member_name, role, row.date_from, row.date_to, row.gov_name])


df = pd.DataFrame(data = rows_list, columns=['date', 'event', 'member_name',
                                             'member_role', 'gov_date_from', 'gov_date_to', 'gov_name'])

# check if roles are correct
if not assert_correct_roles(df):
    print('\nNot all entries comply with the proper role format.')
else:
    print('\nAll entries have proper role format.')

df.to_csv('../out_files/original_gov_members_data.csv', header=True, index=False, encoding='utf-8')
print('\nCreated file original_gov_members_data.csv')
-------------------------------------------------------------------------------- 1 | member_name,role,role_start_date,role_end_date,gender 2 | αβδελας κωνσταντινου αποστολος,ζ αντιπροεδρος βουλης,18/07/2019,28/07/2020,male 3 | μαυρωτας παρασκευα γεωργιος,ζ αντιπροεδρος βουλης,06/05/2019,07/07/2019,male 4 | μειμαρακης ιωαννη ευαγγελος-βασιλειος,αρχηγος κομματος:προεδρος νεας δημοκρατιας,05/07/2015,23/11/2015,male 5 | μειμαρακης ιωαννη ευαγγελος-βασιλειος,ιε προεδρος βουλης,29/06/2012,06/02/2015,male 6 | μειμαρακης ιωαννη ευαγγελος-βασιλειος,αρχηγος αξιωματικης αντιπολιτευσης,05/07/2015,23/11/2015,male 7 | μητροπουλος παναγιωτη αλεξιος,α αντιπροεδρος βουλης,06/02/2015,04/10/2015,male 8 | μητσοτακης κυριακου κωνσταντινος,πρωθυπουργος,11/04/1990,13/10/1993,male 9 | μητσοτακης κυριακου κωνσταντινος,αρχηγος αξιωματικης αντιπολιτευσης,13/10/1993,03/11/1993,male 10 | μητσοτακης κυριακου κωνσταντινος,αρχηγος αξιωματικης αντιπολιτευσης,01/09/1984,12/10/1989,male 11 | μητσοτακης κυριακου κωνσταντινος,αρχηγος κομματος:προεδρος νεας δημοκρατιας,01/09/1984,03/11/1993,male 12 | μητσοτακης κωνσταντινου κυριακος,αρχηγος κομματος:προεδρος νεας δημοκρατιας,10/01/2016,28/07/2020,male 13 | μητσοτακης κωνσταντινου κυριακος,αρχηγος αξιωματικης αντιπολιτευσης,10/01/2016,08/07/2019,male 14 | μητσοτακης κωνσταντινου κυριακος,πρωθυπουργος,08/07/2019,28/07/2020,male 15 | μιχαλολιακος γεωργιου νικολαος,αρχηγος κομματος:γενικος γραμματεας χρυσης αυγης,01/12/1980,28/07/2020,male 16 | μπαλαφας περικλη ιωαννης,β αντιπροεδρος βουλης,06/02/2015,04/10/2015,male 17 | μπενακη-ψαρουδα ευαγγελου αννα,ια προεδρος βουλης,19/03/2004,27/09/2007,female 18 | μπενακη-ψαρουδα ευαγγελου αννα,δ αντιπροεδρος βουλης,21/04/2000,19/03/2004,female 19 | αποστολατος σπυριδωνα βαιτσης,ζ αντιπροεδρος,10/10/2008,14/10/2009,male 20 | αποστολατος σπυριδωνα βαιτσης,στ αντιπροεδρος βουλης,15/10/2009,17/05/2012,male 21 | αποστολιδης θωμα λουκας,γ αντιπροεδρος βουλης,08/10/1996,21/04/2000,male 22 | μπουρας κωνσταντινου 
αθανασιος,γ αντιπροεδρος βουλης,18/07/2019,28/07/2020,male 23 | μωραιτης παναγιωτη γεωργιος,ε αντιπροεδρος βουλης,04/07/1989,22/04/1990,male 24 | νακος σταυρου αθανασιος,β αντιπροεδρος βουλης,18/05/2012,28/03/2013,male 25 | νεραντζης δημητριου αναστασιος,γ αντιπροεδρος βουλης,27/09/2007,14/10/2009,male 26 | νικολαιδου αρισταρχου βαρβαρα (βερα),ε αντιπροεδρος βουλης,27/09/2007,17/05/2012,female 27 | νιωτης δημητριου γρηγοριος,α αντιπροεδρος βουλης,15/10/2009,17/05/2012,male 28 | ξαρχας λαζαρου αθανασιος,β αντιπροεδρος βουλης,22/04/1990,22/10/1993,male 29 | παγκαλος γεωργιου θεοδωρος,αντιπροεδρος κυβερνησης,06/10/2009,17/05/2012,male 30 | παντελακη αντωνιου ελπιδα,ζ αντιπροεδρος βουλης,18/05/2012,28/06/2012,female 31 | παπαδημητριου δημητριου ελισαβετ (ελσα),β αντιπροεδρος βουλης,27/09/2007,14/10/2009,female 32 | αργυρης ιωαννη ευαγγελος,γ αντιπροεδρος βουλης,15/10/2009,17/05/2012,male 33 | παπανδρεου ανδρεα γεωργιος,πρωθυπουργος,06/10/2009,11/11/2011,male 34 | παπανδρεου ανδρεα γεωργιος,αρχηγος αξιωματικης αντιπολιτευσης,10/03/2004,06/10/2009,male 35 | παπανδρεου ανδρεα γεωργιος,αρχηγος κομματος:προεδρος πανελληνιου σοσιαλιστικου κινηματος,08/02/2004,18/03/2012,male 36 | παπανδρεου γεωργιου ανδρεας,αρχηγος αξιωματικης αντιπολιτευσης,11/04/1990,13/10/1993,male 37 | παπανδρεου γεωργιου ανδρεας,αρχηγος αξιωματικης αντιπολιτευσης,12/10/1989,23/11/1989,male 38 | παπανδρεου γεωργιου ανδρεας,αρχηγος κομματος:προεδρος πανελληνιου σοσιαλιστικου κινηματος,03/09/1974,23/06/1996,male 39 | παπαρηγα νικολαου αλεξανδρα,αρχηγος κομματος:γενικος γραμματεας κομμουνιστικου κομματος ελλαδος,27/02/1991,14/04/2013,female 40 | παπουλιας γρηγοριου καρολος,προεδρος ελληνικης δημοκρατιας,12/03/2005,13/03/2015,male 41 | παραστατιδης σωκρατη θεοδωρος,αρχηγος κομματος:προεδρος ανεξαρτητων δημοκρατικων βουλευτων,23/11/2013,06/03/2014,male 42 | παυλοπουλος βασιλειου προκοπιος,προεδρος ελληνικης δημοκρατιας,13/03/2015,13/03/2020,male 43 | πετσαλνικος ιωαννη φιλιππος,δ αντιπροεδρος 
βουλης,19/03/2004,14/10/2009,male 44 | πετσαλνικος ιωαννη φιλιππος,ιγ προεδρος βουλης,15/10/2009,18/05/2012,male 45 | πικραμμενος οθωνος παναγιωτης,πρωθυπουργος,16/05/2012,20/06/2012,male 46 | πικραμμενος οθωνος παναγιωτης,αντιπροεδρος κυβερνησης,09/07/2019,28/07/2020,male 47 | πικραμμενος οθωνος παναγιωτης,αρχηγος κομματος:προεδρος νεας δημοκρατιας,29/11/2009,05/07/2015,male 48 | πλακιωτακης ιωσηφ ιωαννης,αρχηγος αξιωματικης αντιπολιτευσης,23/11/2015,10/01/2016,male 49 | πλακιωτακης ιωσηφ ιωαννης,αρχηγος κομματος:προεδρος νεας δημοκρατιας,23/11/2015,10/01/2016,male 50 | πολυδωρας γεωργιου βυρων,δ αντιπροεδρος βουλης,15/10/2009,17/05/2012,male 51 | πολυδωρας γεωργιου βυρων,ιδ προεδρος βουλης,18/05/2012,29/06/2012,male 52 | σακοραφα ηλια σοφια,η αντιπροεδρος βουλης,18/07/2019,28/07/2020,female 53 | σαμαρας κωνσταντινου αντωνιος,πρωθυπουργος,20/06/2012,26/01/2015,male 54 | σαμαρας κωνσταντινου αντωνιος,αρχηγος αξιωματικης αντιπολιτευσης,29/11/2009,16/05/2012,male 55 | σαμαρας κωνσταντινου αντωνιος,αρχηγος αξιωματικης αντιπολιτευσης,26/01/2015,05/07/2015,male 56 | σαμαρας κωνσταντινου αντωνιος,αρχηγος κομματος:προεδρος νεας δημοκρατιας,29/11/2009,05/07/2015,male 57 | σαμαρας κωνσταντινου αντωνιος,αρχηγος κομματος:προεδρος πολιτικης ανοιξης,30/06/1993,13/05/2004,male 58 | σγουριδης σγουρη παναγιωτης,β αντιπροεδρος βουλης,08/10/1996,19/03/2004,male 59 | σγουριδης σγουρη παναγιωτης,γ αντιπροεδρος βουλης,22/10/1993,08/10/1996,male 60 | σημιτης γεωργιου κωστας,αρχηγος κομματος:προεδρος πανελληνιου σοσιαλιστικου κινηματος,30/06/1996,08/02/2004,male 61 | σιουφας γεωργιου δημητριος,ιβ προεδρος βουλης,27/09/2007,15/10/2009,male 62 | σουρλας ιωαννη γεωργιος,α αντιπροεδρος βουλης,27/09/2007,14/10/2009,male 63 | σουρλας ιωαννη γεωργιος,β αντιπροεδρος βουλης,19/03/2004,27/09/2007,male 64 | σταυρογιαννης δημητριου νικολαος,αρχηγος κομματος:προεδρος ανεξαρτητων δημοκρατικων βουλευτων,06/03/2014,26/08/2014,male 65 | στεφανοπουλος δημητριου κωνσταντινος,προεδρος ελληνικης 
δημοκρατιας,10/03/1995,12/03/2005,male 66 | στεφανοπουλος δημητριου κωνσταντινος,αρχηγος κομματος:προεδρος δημοκρατικης ανανεωσης,06/12/1985,24/08/1996,male 67 | ταλιαδουρος αθανασιου σπυριδων,γ αντιπροεδρος βουλης,18/05/2012,28/06/2012,male 68 | τασουλας αναστασιου κωνσταντινος,ιη προεδρος βουλης,18/07/2019,28/07/2020,male 69 | τζαννετακης πετρου τζαννης,αντιπροεδρος κυβερνησης,11/04/1990,08/08/1991,male 70 | τζαννετακης πετρου τζαννης,αντιπροεδρος κυβερνησης,08/08/1991,13/10/1993,male 71 | τραγακης παναγιωτη ιωαννης,γ αντιπροεδρος βουλης,19/03/2004,27/09/2007,male 72 | τραγακης παναγιωτη ιωαννης,α αντιπροεδρος βουλης,29/06/2012,06/02/2015,male 73 | τσαλδαρης κωνσταντινου αθανασιος,ε προεδρος βουλης,04/07/1989,21/11/1989,male 74 | τσαλδαρης κωνσταντινου αθανασιος,στ προεδρος βουλης,21/11/1989,22/04/1990,male 75 | τσαλδαρης κωνσταντινου αθανασιος,ζ προεδρος βουλης,22/04/1990,22/10/1993,male 76 | τσιπρας παυλου αλεξιος,πρωθυπουργος,26/01/2015,27/08/2015,male 77 | τσιπρας παυλου αλεξιος,πρωθυπουργος,21/09/2015,08/07/2019,male 78 | τσιπρας παυλου αλεξιος,αρχηγος αξιωματικης αντιπολιτευσης,20/06/2012,26/01/2015,male 79 | τσιπρας παυλου αλεξιος,αρχηγος αξιωματικης αντιπολιτευσης,08/07/2019,28/07/2020,male 80 | τσιπρας παυλου αλεξιος,αρχηγος κομματος:προεδρος συνασπισμου αριστερας κινηματων και οικολογιας,10/02/2008,10/07/2013,male 81 | τσιπρας παυλου αλεξιος,αρχηγος κομματος:προεδρος συνασπισμου ριζοσπαστικης αριστερας,17/06/2012,28/07/2020,male 82 | τσοβολας κωνσταντινου δημητριος,αρχηγος κομματος:προεδρος δημοκρατικου κοινωνικου κινηματος,20/12/1995,20/03/2004,male 83 | φαρακος κωνσταντινου γρηγοριος,αρχηγος κομματος:γενικος γραμματεας κομμουνιστικου κομματος ελλαδος,11/07/1989,27/02/1991,male 84 | φλωρακης ιωαννη χαριλαος,αρχηγος κομματος:γενικος γραμματεας κομμουνιστικου κομματος ελλαδος,17/11/1974,11/07/1989,male 85 | φλωρακης ιωαννη χαριλαος,αρχηγος κομματος:προεδρος συνασπισμου αριστερας κινηματων και οικολογιας,08/04/1989,09/03/1990,male 86 | φραγκος αναστασιου 
δημητριος,γ αντιπροεδρος βουλης,22/04/1990,22/10/1993,male 87 | βαρεμενος βασιλειου γεωργιος,β αντιπροεδρος,04/10/2015,18/07/2019,male 88 | χαραλαμπιδου δημητριου δεσποινα,γ αντιπροεδρος βουλης,06/02/2015,04/10/2015,female 89 | χατζηγακης μιχαηλ σωτηριος,α αντιπροεδρος βουλης,19/03/2004,27/09/2007,male 90 | βαρουφακης γεωργιου γιανης,αρχηγος κομματος:προεδρος μετωπου ευρωπαικης ρεαλιστικης ανυπακοης,26/03/2018,28/07/2020,male 91 | χουντης βασιλειου νικολαος,αρχηγος κομματος:γενικος γραμματεας λαικης ενοτητας,09/06/2019,28/07/2020,male 92 | χριστοδουλοπουλου ζαχαρια αναστασια,γ αντιπροεδρος,04/10/2015,18/07/2019,female 93 | βελοπουλος ιωσηφ κυριακος,αρχηγος κομματος:προεδρος ελληνικης λυσης,28/06/2016,28/07/2020,male 94 | βενιζελος βασιλειου ευαγγελος,αντιπροεδρος κυβερνησης,17/06/2011,21/03/2012,male 95 | βενιζελος βασιλειου ευαγγελος,αντιπροεδρος κυβερνησης,25/06/2013,27/01/2015,male 96 | βενιζελος βασιλειου ευαγγελος,αρχηγος κομματος:προεδρος πανελληνιου σοσιαλιστικου κινηματος,18/03/2012,14/06/2015,male 97 | βενιζελος κυριακου νικητας,ε αντιπροεδρος βουλης,22/10/1993,08/10/1996,male 98 | βιτσας αθανασιου δημητριος,δ αντιπροεδρος βουλης,18/07/2019,28/07/2020,male 99 | βουτσης γεωργιου νικολαος,ιζ προεδρος βουλης,04/10/2015,18/07/2019,male 100 | βρεττος σπυριδωνος κωνσταντινος (ντινος),γ αντιπροεδρος βουλης,21/04/2000,19/03/2004,male 101 | γειτονας ιωαννη κωνσταντινος,α αντιπροεδρος βουλης,21/04/2000,19/03/2004,male 102 | γεννηματα γεωργιου φωτεινη (φωφη),αρχηγος κομματος:προεδρος πανελληνιου σοσιαλιστικου κινηματος,14/06/2015,28/07/2020,female 103 | γεννηματα γεωργιου φωτεινη (φωφη),αρχηγος κομματος:προεδρος δημοκρατικης συμπαραταξης,30/08/2015,16/03/2018,female 104 | γεννηματα γεωργιου φωτεινη (φωφη),αρχηγος κομματος:προεδρος κινηματος αλλαγης,17/11/2017,28/07/2020,female 105 | αθανασιου χριστοφα χαραλαμπος,β αντιπροεδρος βουλης,18/07/2019,28/07/2020,male 106 | γρηγορακος γεωργιου λεωνιδας,ε αντιπροεδρος βουλης,18/05/2012,24/06/2013,male 107 | δαμανακη θεοδωρου 
μαρια,α αντιπροεδρος βουλης,04/07/1989,21/11/1989,female 108 | δαμανακη θεοδωρου μαρια,β αντιπροεδρος βουλης,21/11/1989,22/04/1990,female 109 | δαμανακη θεοδωρου μαρια,αρχηγος κομματος:προεδρος συνασπισμου αριστερας κινηματων και οικολογιας,09/03/1990,19/12/1993,female 110 | αλαβανος νικολαου αλεξανδρος,αρχηγος κομματος:προεδρος συνασπισμου αριστερας κινηματων και οικολογιας,12/12/2004,10/02/2008,male 111 | αλαβανος νικολαου αλεξανδρος,αρχηγος κομματος:προεδρος συνασπισμου ριζοσπαστικης αριστερας,12/12/2004,04/10/2009,male 112 | δραγασακης ανδρεα ιωαννης,στ αντιπροεδρος,07/10/2008,14/10/2009,male 113 | δραγασακης ανδρεα ιωαννης,δ αντιπροεδρος βουλης,29/06/2012,06/02/2015,male 114 | δραγασακης ανδρεα ιωαννης,αντιπροεδρος κυβερνησης,23/09/2015,09/07/2019,male 115 | δραγασακης ανδρεα ιωαννης,αντιπροεδρος κυβερνησης,27/01/2015,28/08/2015,male 116 | δρεττακης γεωργιου εμμανουηλ,ε αντιπροεδρος βουλης,22/04/1990,22/10/1993,male 117 | δριβελεγκας κωνσταντινου ιωαννης,ε αντιπροεδρος,09/07/2013,05/02/2015,male 118 | εβερτ αγγελου μιλτιαδης,αρχηγος αξιωματικης αντιπολιτευσης,03/11/1993,21/03/1997,male 119 | εβερτ αγγελου μιλτιαδης,αρχηγος κομματος:προεδρος νεας δημοκρατιας,03/11/1993,21/03/1997,male 120 | ζακολικος χρηστου παυσανιας,β αντιπροεδρος βουλης,22/10/1993,08/10/1996,male 121 | ζηση αθανασιου ροδουλα,β αντιπροεδρος βουλης,15/10/2009,17/05/2012,female 122 | θεοδωρακης παναγιωτη σταυρος,αρχηγος κομματος:προεδρος ποταμι,26/02/2014,24/11/2019,male 123 | θεοχαροπουλος στεφανου αθανασιος,αρχηγος κομματος:προεδρος δημοκρατικης αριστερας,07/06/2015,28/07/2020,male 124 | κακλαμανης μιχαηλ νικητας,δ αντιπροεδρος βουλης,06/02/2015,18/07/2019,male 125 | κακλαμανης μιχαηλ νικητας,α αντιπροεδρος βουλης,18/07/2019,28/07/2020,male 126 | κακλαμανης χρηστου αποστολος,η προεδρος βουλης,22/10/1993,08/10/1996,male 127 | κακλαμανης χρηστου αποστολος,θ προεδρος βουλης,08/10/1996,21/04/2000,male 128 | κακλαμανης χρηστου αποστολος,ι προεδρος βουλης,21/04/2000,19/03/2004,male 129 | καλαντζης 
κωνσταντινου γεωργιος,β αντιπροεδρος,29/03/2013,05/02/2015,male 130 | καμμενος ηλια παναγιωτης (πανος),αρχηγος κομματος:προεδρος ανεξαρτητων ελληνων,24/02/2012,09/06/2019,male 131 | κανελλοπουλος πετρου αθανασιος,αντιπροεδρος κυβερνησης,11/04/1990,08/08/1991,male 132 | κανελλοπουλος πετρου αθανασιος,αντιπροεδρος κυβερνησης,08/08/1991,21/02/1992,male 133 | καραμανλης αλεξανδρου κωνσταντινος,πρωθυπουργος,10/03/2004,06/10/2009,male 134 | καραμανλης αλεξανδρου κωνσταντινος,πρωθυπουργος,19/09/2007,07/10/2009,male 135 | καραμανλης αλεξανδρου κωνσταντινος,αρχηγος αξιωματικης αντιπολιτευσης,06/10/2009,29/11/2009,male 136 | καραμανλης αλεξανδρου κωνσταντινος,αρχηγος αξιωματικης αντιπολιτευσης,21/03/1997,10/03/2004,male 137 | καραμανλης αλεξανδρου κωνσταντινος,αρχηγος κομματος:προεδρος νεας δημοκρατιας,21/03/1997,29/11/2009,male 138 | καρατζαφερης ιωαννη γεωργιος,αρχηγος κομματος:προεδρος λαικου ορθοδοξου συναγερμου,14/09/2000,27/05/2019,male 139 | κατσαρος ηλια νικος,α αντιπροεδρος βουλης,22/04/1990,22/10/1993,male 140 | κατσαρος ηλια νικος,γ αντιπροεδρος βουλης,21/11/1989,22/04/1990,male 141 | κατσαρος ηλια νικος,β αντιπροεδρος βουλης,04/07/1989,21/11/1989,male 142 | κατσαρος ηλια νικος,δ αντιπροεδρος βουλης,22/10/1993,21/04/2000,male 143 | αναγνωστοπουλος γεωργιου θεοδωρος,δ αντιπροεδρος βουλης,21/11/1989,22/04/1990,male 144 | αναγνωστοπουλος γεωργιου θεοδωρος,γ αντιπροεδρος βουλης,04/07/1989,21/11/1989,male 145 | αναγνωστοπουλος γεωργιου θεοδωρος,β αντιπροεδρος βουλης,22/04/1990,22/10/1993,male 146 | κολλια-τσαρουχα ευστρατιου μαρια,στ αντιπροεδρος βουλης,18/05/2012,05/02/2015,female 147 | κοσιωνης ανδρεα παναγιωτης,ε αντιπροεδρος βουλης,19/03/2000,27/09/2007,male 148 | κουβελης ευαγγελου φωτιος-φανουριος,αρχηγος κομματος:προεδρος δημοκρατικης αριστερας,10/07/2010,07/06/2015,male 149 | κουρακης στυλιανου αναστασιος (τασος),ζ αντιπροεδρος βουλης,17/06/2010,17/05/2012,male 150 | κουρακης στυλιανου αναστασιος (τασος),δ αντιπροεδρος βουλης,18/05/2012,28/06/2012,male 151 | 
κουρακης στυλιανου αναστασιος (τασος),α αντιπροεδρος βουλης,04/10/2015,18/07/2019,male 152 | κουτσουμπας αποστολου δημητριος,αρχηγος κομματος:γενικος γραμματεας κομμουνιστικου κομματος ελλαδος,14/04/2013,28/07/2020,male 153 | κρεμαστινος θωμα δημητριος,ε αντιπροεδρος βουλης,04/10/2015,18/07/2019,male 154 | κρητικος νικολαου παναγιωτης,α αντιπροεδρος βουλης,21/11/1989,22/04/1990,male 155 | κρητικος νικολαου παναγιωτης,α αντιπροεδρος βουλης,22/10/1993,21/04/2000,male 156 | κρητικος νικολαου παναγιωτης,δ αντιπροεδρος βουλης,03/07/1989,21/11/1989,male 157 | κρητικος νικολαου παναγιωτης,δ αντιπροεδρος βουλης,22/04/1990,22/10/1993,male 158 | κωνσταντινοπουλος κωνσταντινου οδυσσεας,ε αντιπροεδρος βουλης,18/07/2019,28/07/2020,male 159 | κωνσταντοπουλος ανδρεα νικολαος,αρχηγος κομματος:προεδρος συνασπισμου αριστερας κινηματων και οικολογιας,19/12/1993,12/12/2004,male 160 | κωνσταντοπουλος ανδρεα νικολαος,αρχηγος κομματος:προεδρος συνασπισμου ριζοσπαστικης αριστερας,07/03/2004,12/12/2004,male 161 | κωνσταντοπουλου ν. 
# Raw party label (upper case, spaces stripped) -> normalised party name
# (lower case, no accents).
_PARTY_NAMES = {
    'ΑΝΕΞΑΡΤΗΤΟΙΕΛΛΗΝΕΣΕΘΝΙΚΗΠΑΤΡΙΩΤΙΚΗΔΗΜΟΚΡΑΤΙΚΗΣΥΜΜΑΧΙΑ':
        'ανεξαρτητοι ελληνες εθνικη πατριωτικη δημοκρατικη συμμαχια',
    'ΑΝΕΞΑΡΤΗΤΟΙΕΛΛΗΝΕΣ-ΠΑΝΟΣΚΑΜΜΕΝΟΣ': 'ανεξαρτητοι ελληνες - πανος καμμενος',
    'ΑΝΕΞΑΡΤΗΤΟΙ': 'ανεξαρτητοι (εκτος κομματος)',
    'ΣΥΝΑΣΠΙΣΜΟΣ': 'συνασπισμος της αριστερας των κινηματων και της οικολογιας',
    'ΑΝΕΞΑΡΤΗΤΟΙΔΗΜΟΚΡΑΤΙΚΟΙΒΟΥΛΕΥΤΕΣ': 'ανεξαρτητοι δημοκρατικοι βουλευτες',
    'ΠΟΛ.Α.': 'πολιτικη ανοιξη',
    'ΟΟ.ΕΟ.': 'οικολογοι εναλλακτικοι (ομοσπονδια οικολογικων εναλλακτικων οργανωσεων)',
    'ΔΗ.ΑΝΑ.': 'δημοκρατικη ανανεωση',
    'ΔΗ.Κ.ΚΙ.': 'δημοκρατικο κοινωνικο κινημα',
    'ΕΝΩΣΗΚΕΝΤΡΩΩΝ': 'ενωση κεντρωων',
    'ΝΕΑΔΗΜΟΚΡΑΤΙΑ': 'νεα δημοκρατια',
    'ΛΑ.Ο.Σ.': 'λαικος ορθοδοξος συναγερμος',
    'ΛΑΪΚΟΣΣΥΝΔΕΣΜΟΣ-ΧΡΥΣΗΑΥΓΗ': 'λαικος συνδεσμος - χρυση αυγη',
    'ΚΟΜΜΟΥΝΙΣΤΙΚΟΚΟΜΜΑΕΛΛΑΔΑΣ': 'κομμουνιστικο κομμα ελλαδας',
    'Κ.Κ.Εσ': 'κομμουνιστικο κομμα ελλαδας εσωτερικου',
    'ΣΥΝΑΣΠΙΣΜΟΣΡΙΖΟΣΠΑΣΤΙΚΗΣΑΡΙΣΤΕΡΑΣ': 'συνασπισμος ριζοσπαστικης αριστερας',
    'ΛΑΪΚΗΕΝΟΤΗΤΑ': 'λαικη ενοτητα',
    'ΠΑ.ΣΟ.Κ.': 'πανελληνιο σοσιαλιστικο κινημα',
    'ΔΗΜΟΚΡΑΤΙΚΗΑΡΙΣΤΕΡΑ': 'δημοκρατικη αριστερα',
    'ΔΗΜΟΚΡΑΤΙΚΗΣΥΜΠΑΡΑΤΑΞΗ(ΠΑΝΕΛΛΗΝΙΟΣΟΣΙΑΛΙΣΤΙΚΟΚΙΝΗΜΑ-ΔΗΜΟΚΡΑΤΙΚΗΑΡΙΣΤΕΡΑ)':
        'δημοκρατικη συμπαραταξη (πανελληνιο σοσιαλιστικο κινημα - δημοκρατικη αριστερα)',
    'ΤΟΠΟΤΑΜΙ': 'το ποταμι',
    'ΕΝΩΣΗΚΕΝΤΡΟΥ-ΝΕΕΣΔΥΝΑΜΕΙΣΕΚ/ΝΔ': 'ενωση κεντρου - νεες δυναμεις (ε.κ. - ν.δ.)',
    'ΕΔΗΚ': 'ενωση δημοκρατικου κεντρου (εδηκ)',
    # Previously mapped to an accented name by the first of two duplicate
    # branches; the unaccented form matches every other entry.
    'ΕΘΝΙΚΗΠΑΡΑΤΑΞΙΣ': 'εθνικη παραταξη',
    'ΝΕΟΦΙΛΕΛΕΥΘΕΡΩΝ': 'κομμα νεοφιλελευθερων',
    'ΕΝΙΑΙΑΔΗΜΟΚΡΑΤΙΚΗΑΡΙΣΤΕΡΑ-Ε.Δ.Α.': 'ενιαια δημοκρατικη αριστερα (ε.δ.α.)',
    'ΣΥΜ/ΧΙΑΠΡ': 'συμμαχια προοδευτικων και αριστερων δυναμεων',
    'ΕΛΛΗΝΙΚΗΛΥΣΗ-ΚΥΡΙΑΚΟΣΒΕΛΟΠΟΥΛΟΣ': 'ελληνικη λυση - κυριακος βελοπουλος',
    'ΚΙΝΗΜΑΑΛΛΑΓΗΣ': 'κινημα αλλαγης',
    'ΜέΡΑ25': 'μετωπο ευρωπαικης ρεαλιστικης ανυπακοης (μερα25)',
}


def party_formatting(party):
    """Map a raw party label to its normalised full party name.

    Parameters:
        party: the raw label (upper case, without spaces, e.g. 'ΠΑ.ΣΟ.Κ.').

    Returns:
        The lower-case, accent-free party name for a known label. An unknown
        label is reported on stdout and returned unchanged.
    """
    try:
        return _PARTY_NAMES[party]
    except (KeyError, TypeError):
        # TypeError covers unhashable values, which the former if/elif chain
        # simply treated as "not matched".
        print('Party not matched to existing list ', party)
        return party
def region_formatting(region):
    """Normalise the name of an administrative (electoral) region.

    The value is lower-cased, accents are stripped (the Latin letter 'i' is
    mapped to the Greek 'ι' as well), tabs / stand-alone accent marks become
    spaces, whitespace is collapsed and trailing whitespace removed. Regions
    that arrive with their words glued together are then expanded to their
    readable form.

    Parameters
    ----------
    region : str
        Raw region text.

    Returns
    -------
    str
        The normalised region name.
    """
    region = region.lower().translate(str.maketrans('άέόώήίϊΐiύϋΰ', 'αεοωηιιιιυυυ'))
    region = re.sub(r"\t+", " ", region)
    region = re.sub(r"΄", " ", region)
    region = re.sub(r"\s\s+", " ", region)
    region = region.rstrip()

    # Glued spellings found in the data -> readable spelling.
    glued_to_readable = {
        "α'θεσσαλονικης": "α' θεσσαλονικης",
        "α'αθηνων": "α' αθηνων",
        "β'θεσσαλονικης": "β' θεσσαλονικης",
        "β'αθηνων": "β' αθηνων",
        "β2'δυτικουτομεααθηνων": "β2' δυτικου τομεα αθηνων",
        "α ανατολικησαττικης": "α' ανατολικης αττικης",
        "α'πειραιως": "α' πειραιως",
        "β'πειραιως": "β' πειραιως",
        "β3 νοτιουτομεααθηνων": "β3' νοτιου τομεα αθηνων",
        "β δυτικησαττικης": "β' δυτικης αττικης",
        "β1'βορειουτομεααθηνων": "β1' βορειου τομεα αθηνων",
    }
    return glued_to_readable.get(region, region)
# ---------------------------------------------------------------------------
# Top-level cleaning script: reads the crawled parliament-member events and
# writes one row per (member, party, region, date range) from 1989 onwards.
# NOTE(review): the nesting below was reconstructed from a whitespace-collapsed
# copy of the file; confirm the indentation against the real source.
# ---------------------------------------------------------------------------
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

df = pd.read_csv('../out_files/original_parl_members_data.csv', encoding='utf-8', header=None,
                 names=['no', 'member_name', 'period_date_range', 'event_date',
                        'administrative_region', 'political_party', 'event_description'])

# remove lines that contain "NO DATA"
df = df[~df['period_date_range'].str.contains("NO DATA")]

# remove characters from period
# The pattern is a character class, so it must be treated as a regular
# expression. regex=True is passed explicitly because newer pandas versions
# default to literal replacement, which would silently stop cleaning the column.
df['period_date_range'] = df['period_date_range'].\
    str.replace(r"[a-zA-Zα-ωΑ-Ω΄':()]", '', regex=True)

# split dates of period start & end and create new columns
dates = df['period_date_range'].str.split('-', n=1, expand=True)

df['period_start_date'] = dates[0]
df['period_start_date'] = pd.to_datetime(df['period_start_date'],
                                         format='%d/%m/%Y')
df['period_end_date'] = dates[1]
df['period_end_date'] = pd.to_datetime(df['period_end_date'],
                                       format='%d/%m/%Y')

# drop old column
df.drop(columns=['period_date_range'], inplace=True)

# keep periods that end from 1989 onwards, matching the available proceedings
df = df[(df['period_end_date'].dt.year >= 1989)]

# replace not needed strings in data cells
df['member_name'] = df['member_name'].str.replace("Name:", '')
df['event_date'] = df['event_date'].str.replace("Date:", '')
df['administrative_region'] = df['administrative_region'].str.replace("Administrative-Region:", '')
df['event_date'] = pd.to_datetime(df['event_date'], format='%d/%m/%Y')
df['political_party'] = df['political_party'].str.replace(
    "Parliamentary-Party:", '')
df['event_description'] = df['event_description'].str.replace("Description:", '')

# Format political party information
df['political_party'] = df['political_party'].apply(party_formatting)

df['administrative_region'] = df['administrative_region'].apply(region_formatting)

new_dfrows_list = []


# A member starts their parliamentary term with any of the following events,
# either by being elected or by replacing someone that left.
# Thus, start_cases can only be the first of the events for each member.
# NOTE(review): the first literal is copied verbatim from the source, including
# its leading character; confirm it matches the crawled data.
start_cases = ['aντικατέστησε',  # e.g. aντικατέστησε:δούρουειρήνη(ρένα)αθανασίου(λόγω:παραίτησηςαπότοβουλευτικόαξίωμα)
               'εκλογής',
               ]

# A member ends their parliamentary term with any of the following events:
# resignation, passing away, losing their position or being murdered.
end_cases = ['παραίτησηςαπότοβουλευτικόαξίωμα',
             'απεβίωσε',
             'έκπτωσηςβουλευτικούαξιώματος',
             'δολοφονήθηκε',
             ]

# A member can change their parliamentary position/party with any of the
# following events: change of party or independence from all parties.
change_party_cases = ['προσχώρησης/επανένταξης',  # change of party
                      'προσχώρηση',  # change of party
                      'ανεξαρτητοποίηση',  # independent (outside a party)
                      'προσχωρησηστηνκ.ο.τησνεασδημοκρατιας',  # change of party
                      'ετέθηεκτός',  # independent due to expulsion (outside a party)
                      'διεγράφη',  # independent due to deletion (outside a party)
                      ]

error_cases = []

# For each parliamentary period, for each member in a period
for id, subdf in df.groupby(['no', 'period_start_date']):  # no is a unique id given to each member

    # Remove specific error in data for 'πλακιωτάκης ιωσήφ ιωάννης'
    if subdf.member_name.iloc[0] == 'Πλακιωτάκης Ιωσήφ Ιωάννης' and \
            id[-1] == pd.Timestamp('2015-09-20 00:00:00'):
        subdf = subdf[subdf['event_date'] != pd.to_datetime('2019-07-07')]

    rows_num = subdf.shape[0]

    member_name = name_formatting((subdf.iloc[0]['member_name']).lower())
    end_follows = False  # refers to change of political party or any of the end cases
    change_follows = False  # refers to change of political party

    # If member has only one event in this period, it should be a start case.
    # In this case, the parliamentary term rolled smoothly with no
    # interruptions or changes of political party.
    if rows_num == 1:

        if any((subdf.iloc[0]['event_description'].lower()).startswith(s) for s in start_cases):

            member_start_date = subdf.iloc[0]['event_date']
            member_end_date = subdf.iloc[0]['period_end_date']
            political_party = subdf.iloc[0]['political_party']
            administrative_region = subdf.iloc[0]['administrative_region']

            new_dfrows_list.append({'member_name': member_name,
                                    'member_start_date': member_start_date,
                                    'member_end_date': member_end_date,
                                    'political_party': political_party,
                                    'administrative_region': administrative_region,
                                    })
        else:
            print('Probably missing data of case '+str(id)+', ' +
                  str(subdf.iloc[0]['member_name']))
            print()

    else:
        # If the parliamentary term of the member involves changes of party or
        # end events, we iterate through rows inversely over time.
        for i in range(rows_num-1, -1, -1):  # e.g. 4 rows iterates from index 3 to 0

            # End events. In this case a start or change event must precede,
            # from which we will take the start date of the term.
            if any((subdf.iloc[i]['event_description'].lower()).startswith(e) for e in end_cases):

                member_end_date = subdf.iloc[i]['event_date']
                last_event_date = subdf.iloc[i]['event_date']
                last_political_party = subdf.iloc[i]['political_party']
                end_follows = True
                change_follows = False

            # Change events
            elif any(p in subdf.iloc[i]['event_description'].lower() for p in change_party_cases):

                if end_follows:
                    member_end_date = last_event_date
                elif change_follows:
                    member_end_date = last_event_date - timedelta(days=1)
                else:
                    member_end_date = subdf.iloc[i]['period_end_date']

                member_start_date = subdf.iloc[i]['event_date']
                political_party = subdf.iloc[i]['political_party']
                administrative_region = subdf.iloc[i]['administrative_region']

                new_dfrows_list.append({'member_name': member_name,
                                        'member_start_date': member_start_date,
                                        'member_end_date': member_end_date,
                                        'political_party': political_party,
                                        'administrative_region': administrative_region,
                                        })

                if end_follows:
                    if last_political_party != political_party:
                        error_cases.append(subdf.iloc[i]['event_description'].lower())
                        # str() keeps this diagnostic from raising when the
                        # member id column was parsed as a number.
                        print(str(subdf.iloc[i]['no']).lower())

                # Update last event
                last_event_date = subdf.iloc[i]['event_date']
                last_political_party = subdf.iloc[i]['political_party']
                end_follows = False
                change_follows = True

            # Start events
            elif any((subdf.iloc[i]['event_description'].lower()).startswith(s) for s in start_cases):
                member_start_date = subdf.iloc[i]['event_date']
                administrative_region = subdf.iloc[i]['administrative_region']

                if end_follows:
                    # political party and member_end_date have been declared;
                    # member_end_date is filled if end event follows
                    new_dfrows_list.append({'member_name': member_name,
                                            'member_start_date': member_start_date,
                                            'member_end_date': member_end_date,
                                            'political_party': last_political_party,
                                            'administrative_region': administrative_region,
                                            })
                elif change_follows:
                    member_end_date = last_event_date - timedelta(days=1)
                    political_party = subdf.iloc[i]['political_party']
                    new_dfrows_list.append({'member_name': member_name,
                                            'member_start_date': member_start_date,
                                            'member_end_date': member_end_date,
                                            'political_party': political_party,
                                            'administrative_region': administrative_region,
                                            })

                # if nothing follows like the case of Διαμαντίδης Δημήτριος
                else:
                    member_end_date = subdf.iloc[i]['period_end_date']
                    political_party = subdf.iloc[i]['political_party']
                    new_dfrows_list.append({'member_name': member_name,
                                            'member_start_date': member_start_date,
                                            'member_end_date': member_end_date,
                                            'political_party': political_party,
                                            'administrative_region': administrative_region,
                                            })

                    print('Check case for '+str(subdf.iloc[i]['member_name']) +
                          ' around date '+str(subdf.iloc[i]['period_end_date']))


new_df = pd.DataFrame(new_dfrows_list, columns=['member_name', 'member_start_date',
                                                'member_end_date', 'political_party',
                                                'administrative_region',
                                                ])

# drop activity that ends before 1/1/1989
new_df = new_df[(new_df['member_end_date'].dt.year >= 1989)]

# replace start dates before 1989 with 1/1/1989
new_df['member_start_date'] = np.where(new_df['member_start_date'] < '1989-01-01',
                                       pd.to_datetime(['1989-01-01']),
                                       new_df['member_start_date'])

new_df.to_csv('../out_files/parl_members_activity_1989onwards.csv', header=True, index=False, encoding='utf-8')
# Patterns and translation tables used by text_formatting. They are built once
# at import time (the function is called for every detected speaker) and are
# written as raw strings so that "\s" is not an invalid string escape.
_PUNCTUATION_RE = re.compile(r"[():'’`΄‘]")
_TABS_RE = re.compile(r"\t+")
_MULTISPACE_RE = re.compile(r"\s\s+")
_DASH_RE = re.compile(r"\s*(-|–)\s*")
_ACCENT_TABLE = str.maketrans('άέόώήίϊΐiύϋΰ', 'αεοωηιιιιυυυ')
_LATIN_TO_GREEK_TABLE = str.maketrans('akebyolruxtvhmnz', 'ακεβυολρυχτνημνζ')


# Cleaning and formatting speakers data
def text_formatting(text):
    """Normalise speaker text for fuzzy matching against member names.

    Parentheses, colons and quote-like marks become spaces, whitespace is
    trimmed and collapsed, dashes lose their surrounding spaces, and the text
    is lower-cased. Accents are then stripped and Latin letters are converted
    to the Greek letters listed in the translation tables.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The normalised text.
    """
    text = _PUNCTUATION_RE.sub(' ', text)
    text = _TABS_RE.sub(' ', text)        # replace one or more tabs with one space
    text = text.lstrip().rstrip()         # remove leading and trailing spaces
    text = _MULTISPACE_RE.sub(' ', text)  # replace more than one spaces with one space
    text = _DASH_RE.sub('-', text)        # fix dashes
    text = text.lower()
    text = text.translate(_ACCENT_TABLE)          # remove accents
    text = text.translate(_LATIN_TO_GREEK_TABLE)  # convert english chars to greek
    return text


def speaker_name_corrections(name):
    """Apply hand-picked spelling corrections to a normalised speaker name.

    Parameters
    ----------
    name : str
        Speaker name as produced by text_formatting.

    Returns
    -------
    str
        The name with the known misspellings replaced.
    """
    name = name.replace('γενηματα', 'γεννηματα')
    # The first-name fix is only valid for this specific surname.
    if 'βαρουφακης' in name:
        name = name.replace('γιαννης', 'γιανης')
    name = name.replace('ζουραρις', 'ζουραρης')
    return name


# for example ΠΟΛΛΟΙ ΒΟΥΛΕΥΤΕΣ (από την πτέρυγα του ΠΑ.ΣΟ.Κ.):...
def party_of_generic_reference(speaker):
    """Return the party named in a generic speaker reference.

    Used for references to unnamed members, for example
    "ΠΟΛΛΟΙ ΒΟΥΛΕΥΤΕΣ (από την πτέρυγα του ΠΑ.ΣΟ.Κ.):". The keywords are
    checked in order and the first one contained in ``speaker`` wins; when
    none is found the whole parliament ('βουλη') is returned.

    Parameters
    ----------
    speaker : str
        Lower-cased, accent-free speaker text.

    Returns
    -------
    str
        The party name, 'αντιπολιτευση' or 'βουλη'.
    """
    keyword_to_party = (
        ('πασοκ', 'πανελληνιο σοσιαλιστικο κινημα'),
        ('δημοκρατια', 'νεα δημοκρατια'),
        ('συνασπισμου', 'συνασπισμος της αριστερας των κινηματων και της οικολογιας'),
        ('λαος', 'λαικος ορθοδοξος συναγερμος'),
        ('συριζα', 'συνασπισμος ριζοσπαστικης αριστερας'),
        ('αντιπολιτευσ', 'αντιπολιτευση'),
    )
    for keyword, party in keyword_to_party:
        if keyword in speaker:
            return party
    return 'βουλη'


# For example ΦΩΤΕΙΝΗ (ΦΩΦΗ ΓΕΝΝΗΜΑΤΑ (Πρόεδρος της Δημοκρατικής Συμπαράταξης ΠΑΣΟΚ - ΔΗΜΑΡ):,2017
def separate_nickname_incomplete_parenthesis(speaker, speaker_nickname):
    """Extract a nickname whose parenthesis was opened but never closed.

    Only acts when ``speaker`` has more '(' than ')' and the module-level
    ``incomplete_nickname_parenthesis`` pattern matches.

    Parameters
    ----------
    speaker : str
        Raw speaker text.
    speaker_nickname : str
        Nickname found so far; returned unchanged when nothing is extracted.

    Returns
    -------
    tuple
        ``(speaker, speaker_nickname)``: the speaker with the nickname
        fragment removed and the nickname passed through text_formatting.
    """
    lefts = len(left_parenthesis_regex.findall(speaker))
    rights = len(right_parenthesis_regex.findall(speaker))
    if lefts > rights:
        match = incomplete_nickname_parenthesis.search(speaker)
        if match:
            # Keep separately the nickname of the speaker
            speaker_nickname = text_formatting(match.group())
            speaker = incomplete_nickname_parenthesis.sub('', speaker)  # remove nickname
    return speaker, speaker_nickname


# Keep separately the nickname of the speaker
def separate_nickname(speaker):
    """Split an upper-case "(NICKNAME)" off the speaker text.

    The caller must first check that ``caps_nickname_in_parenthesis`` matches
    ``speaker``; without a match this raises AttributeError.

    Returns
    -------
    tuple
        ``(speaker, speaker_nickname)``: the speaker with every such
        parenthesis removed and the first one passed through text_formatting.
    """
    speaker_nickname = text_formatting(caps_nickname_in_parenthesis.search(speaker).group())
    speaker = caps_nickname_in_parenthesis.sub('', speaker)  # remove nickname
    return speaker, speaker_nickname


# Keep separately the explanatory parenthesis text of the speaker
def separate_explanatory_parenthesis(speaker):
    """Split the explanatory "(text)" off the speaker text.

    The caller must first check that ``text_in_parenthesis`` matches
    ``speaker``; without a match this raises AttributeError.

    Returns
    -------
    tuple
        ``(speaker, speaker_info)``: the speaker with every parenthesised
        text removed and the first parenthesised text, unformatted.
    """
    speaker_info = text_in_parenthesis.search(speaker).group()
    speaker = text_in_parenthesis.sub('', speaker)  # remove (text in parenthesis)
    return speaker, speaker_info
def format_speaker_info(speaker_info):
    """Normalise the explanatory text that accompanies a speaker.

    Runs text_formatting, expands the abbreviations 'υφυπ.' and 'υπ.',
    replaces '&' with ' και ', then collapses and trims whitespace.

    Parameters
    ----------
    speaker_info : str
        Raw explanatory text, e.g. the content of a parenthesis.

    Returns
    -------
    str
        The normalised text.
    """
    speaker_info = text_formatting(speaker_info)
    # 'υφυπ.' must be expanded before 'υπ.', which is a suffix of it.
    speaker_info = speaker_info.replace('υφυπ.', ' υφυπουργος ')
    speaker_info = speaker_info.replace('υπ.', ' υπουργος ')
    speaker_info = speaker_info.replace('&', ' και ')
    # replace more than one spaces with one space (raw string: "\s" is not a
    # valid escape in an ordinary string literal)
    speaker_info = re.sub(r'\s\s+', ' ', speaker_info)
    return speaker_info.lstrip().rstrip()  # remove leading and trailing spaces


# compare temp max with similarity of the member's name alternatives with the speaker name
def compare_with_alternative_sim(speaker_name, member_name, member_surname, temp_max, greek_names):
    """Raise ``temp_max`` using the alternative forms of a member's first name.

    Parameters
    ----------
    speaker_name : str
        Normalised name of the detected speaker.
    member_name : str
        First name of the candidate member.
    member_surname : str
        Surname of the candidate member.
    temp_max : float
        Best similarity found so far.
    greek_names : iterable of str
        Comma-separated lines; the first entry of a line is a name and the
        remaining entries are its alternatives.

    Returns
    -------
    float
        The maximum of ``temp_max`` and the Jaro-Winkler similarities of
        ``speaker_name`` with "alternative surname" and "surname alternative".
    """
    # each row in the greek_names data is unique concerning the first name of the row
    # greek_names has only those names that have at least one alternative,
    # so each line has at least two names
    for line in greek_names:

        name_list = line.strip().split(',')

        # if member name has alternatives
        if name_list[0] == member_name:

            # keep alternatives of the name (everything after the leading entry)
            for alternative_name in name_list[1:]:
                alternative_sim1 = jellyfish.jaro_winkler_similarity(
                    speaker_name, alternative_name + ' ' + member_surname)
                alternative_sim2 = jellyfish.jaro_winkler_similarity(
                    speaker_name, member_surname + ' ' + alternative_name)
                temp_max = max(temp_max, alternative_sim1, alternative_sim2)

            break  # the name was found: stop scanning and return temp_max

    return temp_max
def get_gov(current_record_datetime):
    """Return the government in office on the date of a record.

    Reads '../out_files/governments_1989onwards.csv' and selects the rows with
    ``date_from <= current_record_datetime < date_to``.

    Parameters
    ----------
    current_record_datetime : datetime
        Date of the sitting.

    Returns
    -------
    list of str
        A one-element list formatted as 'gov_name(dd/mm/YYYY-dd/mm/YYYY)',
        built from the first selected row.

    Raises
    ------
    IndexError
        When no government covers the date (after the warning is printed).
    """
    df_govs = pd.read_csv('../out_files/governments_1989onwards.csv', encoding='utf-8')
    df_govs['date_from'] = pd.to_datetime(df_govs['date_from'])
    df_govs['date_to'] = pd.to_datetime(df_govs['date_to'])
    df_govs = df_govs.sort_values(by='date_from', ascending=True)
    current_gov_df = df_govs.loc[(df_govs.date_from <= current_record_datetime)
                                 & (current_record_datetime < df_govs.date_to)]

    # Exactly one government is expected; anything else is only reported.
    if current_gov_df.shape[0] != 1:
        print('problem with ', current_record_datetime)

    item = current_gov_df.gov_name.iloc[0] + '(' + current_gov_df.date_from.iloc[0].strftime('%d/%m/%Y') +\
        '-' + current_gov_df.date_to.iloc[0].strftime('%d/%m/%Y') + ')'

    return [item]


def keep_roles_at_date(roles, current_record_datetime):
    """Keep only the roles that were held on the given date.

    Parameters
    ----------
    roles : list of str or str
        Roles formatted as 'role name(dd/mm/YYYY-dd/mm/YYYY)', or the string
        representation of such a list (parsed with ast.literal_eval).
    current_record_datetime : datetime
        Date of the sitting; both ends of a role's date range are inclusive.

    Returns
    -------
    list of str
        The matching roles, unchanged, or ['βουλευτης'] when none matches.
    """
    # accept the stringified list as well
    if not isinstance(roles, list):
        roles = ast.literal_eval(roles)

    new_roles = []
    for role in roles:
        # Split on the LAST '(' so that a role name which itself contains
        # parentheses does not break the unpacking.
        _, role_dates = role.rsplit('(', 1)
        role_start_date, role_end_date = role_dates.replace(')', '').split('-')
        role_start_date = dt.strptime(role_start_date, '%d/%m/%Y')
        role_end_date = dt.strptime(role_end_date, '%d/%m/%Y')
        if role_start_date <= current_record_datetime <= role_end_date:
            new_roles.append(role)

    # if empty list
    if not new_roles:
        new_roles.append('βουλευτης')

    return new_roles
def compute_max_similarity(speaker_name, speaker_nickname, member_name_part):
    """Return the best Jaro-Winkler similarity between a speaker and a member.

    ``member_name_part`` is split on single spaces. With a father's name
    present the layout is "surname father name ..." (surname at index 0,
    first name at index 2); otherwise it is "name surname". The speaker is
    compared with "name surname" and "surname name" for every part of a
    hyphenated first name / surname, for the alternative forms of the first
    name (via compare_with_alternative_sim and the module-level greek_names)
    and, when the speaker has no nickname of its own, for the member's
    lower-case "(nickname)".

    Parameters
    ----------
    speaker_name : str
        Normalised name of the detected speaker.
    speaker_nickname : str
        Nickname extracted from the speaker text, '' when there is none.
    member_name_part : str
        Member name as stored in the members file.

    Returns
    -------
    The maximum similarity found (0 is the starting value).

    NOTE(review): the block nesting was reconstructed from a
    whitespace-collapsed copy of the file; confirm it against the real source.
    NOTE(review): a first name with more than three hyphenated parts leaves
    member_name1 unassigned, and a surname with more than one '-' cannot be
    unpacked into two parts; the code relies on the data not containing such
    names.
    """

    # Decide the name layout from the number of space-separated tokens; a
    # "(nickname)" token counts as one extra token.
    if ( '(' in member_name_part and len(member_name_part.split(' '))>3 ) or ( '(' not in member_name_part and len(member_name_part.split(' '))>2):
        member_surname = member_name_part.split(' ')[0]
        member_name = member_name_part.split(' ')[2]
    else: # extra-parliamentary member without a father's name
        member_surname = member_name_part.split(' ')[1]
        member_name = member_name_part.split(' ')[0]

    temp_max = 0

    # put these transpositions in the beginning, before we remove '-'
    # If member has more than one first names
    if '-' in member_name:
        # there are cases like member name being δενδιας νικολαος-γεωργιος
        # and detected speaker being ΝΙΚΟΛΑΟΣ ΔΕΝΔΙΑΣ

        # if member has two first names
        if len(member_name.split('-'))==2:
            member_name1, member_name2 = member_name.split('-')

        # if member has three first names
        elif len(member_name.split('-'))==3:
            member_name1, member_name2, member_name3 = member_name.split('-')

        # if member has more than one first names and one surname
        if '-' not in member_surname:
            # do the following for two first names
            sim5 = jellyfish.jaro_winkler_similarity(speaker_name, member_name1 + ' '+member_surname)
            sim6 = jellyfish.jaro_winkler_similarity(speaker_name, member_surname+' '+member_name1)
            sim7 = jellyfish.jaro_winkler_similarity(speaker_name, member_name2 + ' '+member_surname)
            sim8 = jellyfish.jaro_winkler_similarity(speaker_name, member_surname+' '+member_name2)

            temp_max = max(temp_max, sim5, sim6, sim7, sim8)
            # Extra comparisons for alternative names of members
            temp_max = compare_with_alternative_sim(speaker_name, member_name1, member_surname, temp_max, greek_names)
            temp_max = compare_with_alternative_sim(speaker_name, member_name2, member_surname, temp_max, greek_names)


            # do the following extra for three first names
            # for example κουικ φιλιππου τερενς-σπενσερ-νικολαος
            if len(member_name.split('-'))==3:
                sim9 = jellyfish.jaro_winkler_similarity(speaker_name, member_name3 + ' '+member_surname)
                sim10 = jellyfish.jaro_winkler_similarity(speaker_name, member_surname + ' '+member_name3)
                temp_max = max(temp_max, sim9, sim10)
                # Extra comparisons for alternative names of members
                temp_max = compare_with_alternative_sim(speaker_name, member_name3,
                                                        member_surname, temp_max,
                                                        greek_names)

        else:
            # If member has more than one first names and two surnames, compare each one separately
            member_surname1,member_surname2=member_surname.split('-')
            sim5 = jellyfish.jaro_winkler_similarity(speaker_name, member_name1 + ' '+member_surname1)
            sim6 = jellyfish.jaro_winkler_similarity(speaker_name, member_surname1+' '+member_name1)
            sim7 = jellyfish.jaro_winkler_similarity(speaker_name, member_name1+' '+member_surname2)
            sim8 = jellyfish.jaro_winkler_similarity(speaker_name, member_surname2+' '+member_name1)
            sim9 = jellyfish.jaro_winkler_similarity(speaker_name, member_name2+' '+member_surname1)
            sim10 = jellyfish.jaro_winkler_similarity(speaker_name, member_surname1+' '+member_name2)
            sim11 = jellyfish.jaro_winkler_similarity(speaker_name, member_name2+' '+member_surname2)
            sim12 = jellyfish.jaro_winkler_similarity(speaker_name, member_surname2+' '+member_name2)

            temp_max = max(temp_max, sim5, sim6, sim7, sim8, sim9, sim10, sim11, sim12)
            #there is no case with 3 first names and 2 last names, so we don't compute that

            # Extra comparisons for alternative names of members
            temp_max = compare_with_alternative_sim(speaker_name, member_name1, member_surname1, temp_max, greek_names)
            temp_max = compare_with_alternative_sim(speaker_name, member_name1, member_surname2, temp_max, greek_names)
            temp_max = compare_with_alternative_sim(speaker_name, member_name2, member_surname1, temp_max, greek_names)
            temp_max = compare_with_alternative_sim(speaker_name, member_name2, member_surname2, temp_max, greek_names)

    # If member has one first name and two surnames
    elif '-' in member_surname:
        member_surname1,member_surname2=member_surname.split('-')
        sim5 = jellyfish.jaro_winkler_similarity(speaker_name, member_name+' '+member_surname1)
        sim6 = jellyfish.jaro_winkler_similarity(speaker_name, member_surname1+' '+member_name)
        sim7 = jellyfish.jaro_winkler_similarity(speaker_name, member_name+' '+member_surname2)
        sim8 = jellyfish.jaro_winkler_similarity(speaker_name, member_surname2+' '+member_name)

        temp_max = max(temp_max, sim5, sim6, sim7, sim8)

        # Extra comparisons for alternative names of members
        temp_max = compare_with_alternative_sim(speaker_name, member_name, member_surname1, temp_max, greek_names)
        temp_max = compare_with_alternative_sim(speaker_name, member_name, member_surname2, temp_max, greek_names)

        #If member has available nickname and two surnames
        # (the member's nickname is only tried when the speaker text carried
        # no nickname of its own)
        if lower_nickname_in_parenthesis.search(member_name_part) and speaker_nickname=='':

            member_nickname = re.sub ('[()]','',(lower_nickname_in_parenthesis.search(member_name_part)).group())
            sim9 = jellyfish.jaro_winkler_similarity(speaker_name, member_nickname+' '+member_surname1)
            sim10 = jellyfish.jaro_winkler_similarity(speaker_name, member_surname1+' '+member_nickname)
            sim11 = jellyfish.jaro_winkler_similarity(speaker_name, member_nickname+' '+member_surname2)
            sim12 = jellyfish.jaro_winkler_similarity(speaker_name, member_surname2+' '+member_nickname)

            temp_max = max(temp_max, sim9, sim10, sim11, sim12)

    # Remove '-' for sim1, sim2 best comparisons
    member_name = member_name.replace('-', ' ')
    member_surname = member_surname.replace('-', ' ')

    #Make comparisons of speaker with members' names and reversed members' names
    sim1 = jellyfish.jaro_winkler_similarity(speaker_name, member_name+' '+member_surname)
    sim2 = jellyfish.jaro_winkler_similarity(speaker_name, member_surname+' '+member_name)
    temp_max = max(temp_max,sim1,sim2)

    # Extra comparisons for alternative names of members
    temp_max = compare_with_alternative_sim(speaker_name, member_name, member_surname, temp_max, greek_names)

    #We compare speaker with member's nickname and surname
    if lower_nickname_in_parenthesis.search(member_name_part) and speaker_nickname=='':

        member_nickname = re.sub ('[()]','',(lower_nickname_in_parenthesis.search(member_name_part)).group())
        sim3 = jellyfish.jaro_winkler_similarity(speaker_name, member_nickname+' '+member_surname)
        sim4 = jellyfish.jaro_winkler_similarity(speaker_name, member_surname+' '+member_nickname)

        temp_max = max(temp_max, sim3, sim4)

    return temp_max
encoding='utf-8') 317 | greek_names = fnames.readlines() 318 | 319 | filenames = sorted([f for f in os.listdir(datapath) if not f.startswith('.')]) 320 | 321 | filename_freqs = defaultdict(int) 322 | 323 | record_counter = 0 324 | 325 | # REGULAR EXPRESSIONS 326 | #------------------------------------------ 327 | speaker_regex = re.compile(r"((\s*[Α-ΩΆ-ΏΪΫΪ́Ϋ́-]+)(\s+[Α-ΩΆ-ΏΪΫΪ́Ϋ́-]+)*\s*(\(.*?\))?\s*\:)") 328 | caps_nickname_in_parenthesis = re.compile(r"(\([Α-ΩΆ-ΏΪΫΪ́Ϋ́]+\))+") #(ΠΑΝΟΣ) 329 | lower_nickname_in_parenthesis = re.compile(r"(\([α-ω]{2,}\))") #(πανος) 330 | text_in_parenthesis = re.compile(r"(\(.*?\)){1}") #(Υπουργός Εσωτερικών) 331 | 332 | # Regex for both proedros or proedreuon 333 | proedr_regex = re.compile( 334 | r"(^(((Π+Ρ(Ο|Ό)+(Ε|Έ))|(Ρ(Ο|Ό)+(Ε|Έ)Δ)|(ΠΡ(Ε|Έ)(Ο|Ό))|(ΠΡ(Ο|Ό)Δ)|(Η ΠΡ(Ο|Ό)(Ε|Έ)ΔΡ)|(ΠΡ(Ε|Έ)Δ))|(ΠΡΟΣΩΡΙΝΗ ΠΡΟΕΔΡΟΣ)|(ΠΡΟΣΩΡΙΝΟΣ ΠΡΟΕΔΡΟΣ)))") 335 | 336 | # Regex for proedros only 337 | proedros_regex = re.compile(r"ΠΡ((Ο|Ό|(ΟΟ))(Ε|Έ)|((ΕΟ)|(ΈΟ)|(ΕΌ)|(ΈΌ)))ΔΡΟΣ") 338 | proedreuon_first_speaker = re.compile(r"((\s*[Α-ΩΆ-ΏΪΫΪ́Ϋ́-]+)(\s+\(([Α-ΩΆ-Ώα-ωά-ώϊϋΐΰΪΫΪ́Ϋ́-]\s*)+\))?\s*\:)$") 339 | general_member_regex = re.compile(r"((Β(Ο|Ό)(Υ|Ύ)(Ε|Έ)Λ)|(Β(Ο|Ό)(Υ|Ύ)Λ(Ε|Έ)(Υ|Ύ)?Τ[^(Α|Ά)]))") 340 | left_parenthesis_regex = re.compile(r"\(") 341 | right_parenthesis_regex = re.compile(r"\)") 342 | incomplete_nickname_parenthesis = re.compile(r"\([Α-ΩΆ-ΏΪΫΪ́]{3,}\s") 343 | sitting_terminated_regex = re.compile(r"λ(υ|ύ)εται\s+η\s+συνεδρ(ι|ί)αση") 344 | #------------------------------------------ 345 | 346 | csv_output = csv.writer(f1) 347 | 348 | # csv header 349 | csv_output.writerow(['member_name', 'sitting_date', 'parliamentary_period', 350 | 'parliamentary_session','parliamentary_sitting', 351 | 'political_party', 'government', 'member_region', 'roles', 'member_gender', 352 | 'speaker_info', 'speech']) 353 | 354 | # Open a file in order to write down the rows with no files 355 | prob_files = open('../out_files/files_with_content_problems_'+ 356 | 
os.path.basename(os.path.normpath(datapath))+'.txt','w+', 357 | encoding='utf-8') 358 | 359 | for filename in filenames: 360 | 361 | record_counter += 1 362 | print("File "+str(record_counter)+' from '+ str(len(filenames))+ ' '+filename) 363 | 364 | # Skip duplicate files 365 | new_name = '_'.join([p for p in filename.split('_') if p!=(filename.split('_')[1])]) 366 | 367 | filename_freqs[new_name]+=1 368 | if filename_freqs[new_name]>1: 369 | continue #with next iteration of for loop 370 | 371 | name_parts_without_extension = (os.path.splitext(filename)[0]).split('_') 372 | record_date = name_parts_without_extension[0] 373 | record_year = record_date.split('-')[0].strip() 374 | current_record_datetime = dt.strptime(record_date, '%Y-%m-%d') 375 | current_gov = get_gov(current_record_datetime) 376 | 377 | name_parts_cleaned = [re.sub("[()-]", ' ', part) for part in name_parts_without_extension] 378 | record_period = re.sub(r"\s\s+",' ',name_parts_cleaned[2].strip()) 379 | record_session = re.sub(r"\s\s+",' ',name_parts_cleaned[3].strip()) 380 | record_sitting = re.sub(r"\s\s+",' ',name_parts_cleaned[4].strip()) 381 | 382 | f3 = open(os.path.join(datapath+filename), 'r', encoding='utf-8') 383 | file_content = f3.read().replace('\n', '') 384 | file_content = re.sub("\s\s+" , " ", file_content) 385 | 386 | # Creates a list of tuples e.g. 
(' ΠΡΟΕΔΡΕΥΩΝ (Βαΐτσης Αποστολάτος):', ' ΠΡΟΕΔΡΕΥΩΝ', '', '(Βαΐτσης Αποστολάτος)') 387 | speakers_groups = re.findall( 388 | r"((\s*[Α-ΩΆ-ΏΪΫΪ́Ϋ́-]+)(\s+\([Α-ΩΆ-Ώα-ωά-ώϊϋΐΰΪΫΪ́Ϋ́-]+\))?(\s+[Α-ΩΆ-ΏΪΫΪ́Ϋ́]+)?(\s+[Α-ΩΆ-ΏΪΫΪ́Ϋ́-]+)*\s*(\(.*?\))?\s*\:)", 389 | file_content) 390 | 391 | # Keep only first full match case of findall 392 | speakers = [speaker[0] for speaker in speakers_groups] 393 | 394 | # Discard introductory text before first speaker 395 | # Use split with maxsplit number 1 in order to split at first occurrence 396 | try: 397 | file_content = file_content.split(speakers[0], 1)[1] 398 | except: 399 | prob_files.write(filename + " \n") 400 | continue # proceed to next iteration/filename 401 | 402 | for i in range(len(speakers)): 403 | 404 | # If not last speaker 405 | if i < (len(speakers)-1): 406 | speaker = speakers[i] 407 | speech,file_content = file_content.split(speakers[i+1], 1) 408 | else: 409 | speaker = speakers[i] 410 | speech = file_content 411 | 412 | # special treatment for first speaker who is usually proedreuon 413 | if i == 0: 414 | if proedreuon_first_speaker.search(speaker.strip()): 415 | speaker = proedreuon_first_speaker.search(speaker.strip()).group() 416 | 417 | # remove parenthesis text which is usually descriptions of procedures 418 | speech = re.sub(text_in_parenthesis, " ", speech) 419 | 420 | # Clean speaker 421 | speaker = speaker.strip() 422 | speaker = re.sub("\s\s+", " ", speaker) 423 | 424 | speaker_info=np.nan 425 | speaker_nickname='' 426 | 427 | #in case the speaker name is like "ΠΡΟΕΔΡΕΥΩΝ (Παναγιώτης Ν. 
Κρητικός):" 428 | # or like ΠΡΟΣΩΡΙΝΟΣ ΠΡΟΕΔΡΟΣ (Ιωάννης Τραγάκης): 429 | if proedr_regex.search(speaker): 430 | 431 | # Hand-picked wrong cases 432 | if any(mistaken in speaker for mistaken in ['ΤΗΛΕΦΩΝΟ', 'ΓΡΑΜΜΑΤΕΙΣ', 'ΠΡΟΕΚΟΠΗΣ']): 433 | continue # to next iteration/speaker 434 | 435 | # For proedreuon 436 | if not proedros_regex.search(speaker): 437 | speaker_info = 'προεδρευων' 438 | 439 | # For proedros 440 | else: 441 | # if the person in proedros 442 | if 'ΠΡΟΣΩΡΙΝ' in speaker: 443 | speaker_info = 'προσωρινος προεδρος' 444 | else: 445 | speaker_info = 'προεδρος' 446 | 447 | segments = speaker.split('(') 448 | speaker = ''.join(segments[1:]) 449 | 450 | # for cases where the name of the person is not mentioned 451 | if len(speaker)<3: 452 | speaker = np.nan 453 | party = np.nan 454 | speaker_gender = np.nan 455 | speaker_region = np.nan 456 | roles = np.nan 457 | csv_output.writerow([speaker, current_record_datetime.strftime('%d/%m/%Y'), 458 | record_period, record_session, record_sitting, 459 | party, current_gov, speaker_region, roles, 460 | speaker_gender, speaker_info, speech]) 461 | 462 | continue # to next iteration/speaker 463 | 464 | if speaker.startswith('ΜΑΡΤΥΣ'): 465 | speaker = speaker.replace('ΜΑΡΤΥΣ', '') 466 | speaker = re.sub("[()]",'', speaker) 467 | speaker_info = 'μαρτυς' 468 | 469 | if len(speaker)<3: #for cases where the name of the person is not mentioned 470 | speaker = np.nan 471 | party = np.nan 472 | speaker_gender = np.nan 473 | speaker_region = np.nan 474 | roles = np.nan 475 | csv_output.writerow([speaker, current_record_datetime.strftime('%d/%m/%Y'), record_period, 476 | record_session, record_sitting, party, current_gov, 477 | speaker_region, roles, speaker_gender, 478 | speaker_info, speech]) 479 | 480 | continue # to next iteration/speaker 481 | 482 | if general_member_regex.search(speaker): 483 | speaker = (re.sub("[():'’`΄‘.]", '', speaker)).lower() 484 | speaker = speaker.translate(str.maketrans('άέόώήίϊΐiύϋΰ', 
'αεοωηιιιιυυυ')) 485 | if 'εφηβοι' in speaker: 486 | continue # to next speaker 487 | else: 488 | party = party_of_generic_reference(speaker) 489 | speaker = np.nan 490 | speaker_gender = np.nan 491 | speaker_region = np.nan 492 | 493 | # When the closing speech is assigned to generic members instead of the proedreuon 494 | # which is usually the case when proedreuon is not mentioned as the closing speaker 495 | # we remove the standard closing talk of the sitting from the generic members speech 496 | if sitting_terminated_regex.search(speech): 497 | speech = \ 498 | re.split("(μ|Μ)ε\s+(τη|την)\s+(συναινεση|συναίνεση)\s+του\s+((σ|Σ)(ω|ώ)ματος|(τ|Τ)μ(η|ή)ματος)", 499 | speech)[0] 500 | 501 | speaker_info = 'βουλευτης/ες' 502 | roles = np.nan 503 | csv_output.writerow([speaker, current_record_datetime.strftime('%d/%m/%Y'), record_period, 504 | record_session, record_sitting, party, current_gov, 505 | speaker_region, roles, speaker_gender, 506 | speaker_info, speech]) 507 | 508 | continue 509 | 510 | # continue 511 | if speaker != '': 512 | 513 | # Exclude very large malformed text that is not a speaker 514 | if len(speaker) < 200: 515 | speaker, speaker_nickname = separate_nickname_incomplete_parenthesis(speaker, speaker_nickname) 516 | 517 | if caps_nickname_in_parenthesis.search(speaker): 518 | speaker, speaker_nickname = separate_nickname(speaker) 519 | 520 | if text_in_parenthesis.search(speaker): 521 | speaker, speaker_info = separate_explanatory_parenthesis(speaker) 522 | speaker_info = format_speaker_info(speaker_info) 523 | 524 | speaker_name = text_formatting(speaker) 525 | speaker_name = speaker_name_corrections(speaker_name) 526 | 527 | # Remove 1-2 letter characters 528 | speaker_name = ' '.join([word for word in speaker_name.split(' ') if len(word) > 2]) 529 | 530 | max_sim = 0 531 | 532 | for index, row in members_df.iterrows(): 533 | 534 | member_start_date = dt.strptime(row.member_start_date, '%Y-%m-%d') 535 | member_end_date = 
dt.strptime(row.member_end_date, '%Y-%m-%d') 536 | 537 | if member_start_date <= current_record_datetime <= member_end_date: 538 | 539 | member_name_part = row.member_name 540 | member_party = row.political_party 541 | member_region = row.administrative_region 542 | member_gender = row.gender 543 | member_gov = row.government_name 544 | roles = ast.literal_eval(row.roles) 545 | 546 | temp_max = compute_max_similarity(speaker_name, speaker_nickname, member_name_part) 547 | 548 | if temp_max > max_sim: 549 | max_sim = temp_max 550 | max_member_name_part = member_name_part 551 | max_member_party = member_party 552 | max_member_region = member_region 553 | max_member_gender = member_gender 554 | max_member_roles = roles 555 | 556 | # Strict hand-picked similarity threshold to avoid false positives 557 | if max_sim > 0.95: 558 | 559 | max_member_roles = keep_roles_at_date(max_member_roles, current_record_datetime) 560 | 561 | csv_output.writerow([max_member_name_part, current_record_datetime.strftime('%d/%m/%Y'), 562 | record_period, record_session, record_sitting, max_member_party, 563 | current_gov, max_member_region, max_member_roles, max_member_gender, 564 | speaker_info, speech]) 565 | 566 | f3.close() 567 | 568 | prob_files.close() 569 | f1.close() 570 | 571 | df = pd.read_csv(f1_path, encoding='utf-8') 572 | 573 | # Remove in period column the string part '-presided-parliamentary-republic+' 574 | df['parliamentary_period'].replace({"-presided-parliamentary-republic_": '_'}, inplace=True, regex=True) 575 | 576 | # Correct order of sitting date column 577 | df['sitting_date'].replace({"-presided-parliamentary-republic_": '_'}, inplace=True, regex=True) 578 | 579 | if (df[df.apply(lambda r: r.str.contains('presided').any(), axis=1)]).shape[0] == 0: 580 | print('Check 3 ok') 581 | else: 582 | print('String \'presided\' is still somewhere in the data') 583 | 584 | # Replace date '93 with 1993 585 | df['parliamentary_session'].replace({"'": '19'}, inplace=True, 
def explode(df, column_to_split):
    """Expand a list-valued column into one row per list element.

    Every other column is repeated as many times as the list in
    `column_to_split` is long, so a row holding ['a', 'b'] becomes two rows.

    Parameters:
        df: DataFrame whose `column_to_split` cells are lists.
        column_to_split: name of the list-valued column.

    Returns:
        A new DataFrame with the original column order and a fresh
        0..n-1 index. Rows keep the relative order of the sorted original
        index, and list elements keep their order inside each row.
    """
    # remember the original column order so it can be restored at the end
    initial_cols = df.columns.to_list()

    # Nothing to expand: np.concatenate() raises on an empty sequence, so
    # return an (equally empty) frame with a fresh index instead.
    if df.empty:
        return df.reset_index(drop=True)

    # all columns except `column_to_split`
    other_cols = df.columns.difference([column_to_split], sort=False)

    # length of each list, i.e. how many rows each original row turns into
    lens = df[column_to_split].str.len()

    # repeat each index label as many times as its list is long
    idx = np.repeat(df.index.values, lens)

    # repeat the values of the other columns accordingly
    res = pd.DataFrame(
        {col: np.repeat(df[col].values, lens) for col in other_cols},
        index=idx)

    # append the flattened list column and restore the column order
    res = res.assign(**{column_to_split: np.concatenate(df[column_to_split].values)})
    res = res[initial_cols]

    # Order by the original index. A stable sort is required here: the
    # repeated labels are ties, and an unstable sort may shuffle the
    # elements that came from the same list.
    res = res.sort_index(kind='mergesort')

    # drop old index and create new
    res = res.reset_index(drop=True)

    return res
def assert_filled_gender(df):
    """Print whether every row of `df` has a gender.

    Reads the 'gender' and 'cleaned_fullname' columns. When some genders
    are missing, the affected names are printed; nothing is raised and
    nothing is returned.
    """
    missing = df['gender'].isnull()

    if missing.any():
        print('Warning: some gender values ar NaN for the following member names...')
        print(df['cleaned_fullname'][missing])
    else:
        print('All names matched to genders.')

    return


def specific_corrections(df):
    """Apply hand-picked fixes to the 'cleaned_fullname' column.

    Most entries turn a surname that was left in the genitive ("-ου")
    into the nominative ("-ος"); the last few fix individual spellings.
    Keys are applied as regular expressions, so keys without `\\b` anchors
    also match inside longer strings.

    The column is replaced on `df` itself and `df` is returned.
    """
    d3 = {'στεφανος μανου': 'στεφανος μανος', 'αθανασιος νακου':'αθανασιος νακος',
          # the original entry mapped this name to itself (a no-op); every
          # sibling entry maps genitive -> nominative, so do the same here
          'αλεξανδρος κοντου' : 'αλεξανδρος κοντος', 'αναργυρος φατουρου' : 'αναργυρος φατουρος',
          'ανδρεας λοβερδου' : 'ανδρεας λοβερδος', 'ανδρεας λυκουρεντζου' : 'ανδρεας λυκουρεντζος',
          'ανδρεας ξανθου' : 'ανδρεας ξανθος', 'γεωργιος αγαπητου' : 'γεωργιος αγαπητος',
          'γεωργιος βερνικου' : 'γεωργιος βερνικος', 'γεωργιος ζαββου' : 'γεωργιος ζαββος',
          'γεωργιος κατρουγκαλου' : 'γεωργιος κατρουγκαλος', 'γεωργιος κουμαντου' : 'γεωργιος κουμαντος',
          'γεωργιος ντολιου' : 'γεωργιος ντολιος', 'γεωργιος ορφανου' : 'γεωργιος ορφανος',
          'γεωργιος παπασταμκου' : 'γεωργιος παπασταμκος', 'γεωργιος ρωμαιου' : 'γεωργιος ρωμαιος',
          'γεωργιος στυλιου' : 'γεωργιος στυλιος', 'γρηγοριος γιανναρου' : 'γρηγοριος γιανναρος',
          'δημητριος αλαμπανου' : 'δημητριος αλαμπανος', 'δημητριος θανου' : 'δημητριος θανος',
          'δημητριος καμμενου' : 'δημητριος καμμενος', 'δημητριος κρεμαστινου' : 'δημητριος κρεμαστινος',
          'δημητριος λιακου' : 'δημητριος λιακος', 'δημητριος φατουρου' : 'δημητριος φατουρος',
          'διονυσιος λιβανου' : 'διονυσιος λιβανος', 'ελευθεριος κρετσου' : 'ελευθεριος κρετσος',
          'ευαγγελος λιβιερατου' : 'ευαγγελος λιβιερατος', 'ευαγγελος μαλεσιου' : 'ευαγγελος μαλεσιος',
          'ευκλειδης τσακαλωτου' : 'ευκλειδης τσακαλωτος', 'ηλιας μοσιαλου' : 'ηλιας μοσιαλος',
          'θεοδωρος γκαμαλετσου' : 'θεοδωρος γκαμαλετσος', 'θεοδωρος δαμιανου' : 'θεοδωρος δαμιανος',
          'θεοδωρος κατριβανου' : 'θεοδωρος κατριβανος', 'θεοδωρος κολιοπανου' : 'θεοδωρος κολιοπανος',
          'θεοδωρος λιβανιου' : 'θεοδωρος λιβανιος', 'θεοδωρος παγκαλου' : 'θεοδωρος παγκαλος',
          'ιωαννης ανδριανου' : 'ιωαννης ανδριανος', 'ιωαννης γιαγκου' : 'ιωαννης γιαγκος',
          'ιωαννης κουτσουκου' : 'ιωαννης κουτσουκος', 'ιωαννης παναρετου' : 'ιωαννης παναρετος',
          'κωνσταντινος βρεττου' : 'κωνσταντινος βρεττος', 'κωνσταντινος κουκοδημου' : 'κωνσταντινος κουκοδημος',
          'λουκας παπαδημου' : 'λουκας παπαδημος', 'παυλος γερουλανου' : 'παυλος γερουλανος',
          'μιχαηλ γαλενιανου' : 'μιχαηλ γαλενιανος', 'νεκταριος σαντορινιου' : 'νεκταριος σαντορινιος',
          'νικολαος κλειτου' : 'νικολαος κλειτος', 'νικολαος-λεανδρος λιναρδατου' : 'νικολαος-λεανδρος λιναρδατος',
          'νικολαος-μιχαηλ αλιβιζατου' : 'νικολαος-μιχαηλ αλιβιζατος', 'παναγιωτης δελημητσου' : 'παναγιωτης δελημητσος',
          'παναγιωτης καμμενου' : 'παναγιωτης καμμενος', 'παναγιωτης πικραμμενου' : 'παναγιωτης πικραμμενος',
          'πετρος-παυλος αλιβιζατου' : 'πετρος-παυλος αλιβιζατος', 'σπυριδων ταλιαδουρου' : 'σπυριδων ταλιαδουρος',
          'σωκρατης φαμελλου' : 'σωκρατης φαμελλος', 'σταυρος μπενου' : 'σταυρος μπενος',
          'φιλιππος πετσαλνικου' : 'φιλιππος πετσαλνικος', 'φραγκουλης φραγκου' : 'φραγκουλης φραγκος',
          'χρηστος ροκοφυλλου' : 'χρηστος ροκοφυλλος', 'χρηστος-γεωργιος σκερτσου' : 'χρηστος-γεωργιος σκερτσος',
          r'\bλιασκου\b': 'λιασκος', r'\bευρυπιδης\b':'ευριπιδης', r'\bιωαννης βαρουφακης\b': 'γιανης βαρουφακης',
          r'\bζουραρις\b': 'ζουραρης'
          }

    df['cleaned_fullname'] = df['cleaned_fullname'].replace(d3, regex=True)

    return df
def first_name_formatting(df):
    """Normalise the 'member_first_name' column with whole-word rewrites.

    Each key is a regular expression anchored with `\\b`, so only complete
    words (or complete parts of a hyphenated name) are rewritten. The
    table mixes case conversions with a few spelling fixes.

    The column is replaced on `df` itself and `df` is returned.
    """
    first_name_patterns = {
        r'\bελπιδα\b': 'ελπιδας',
        r'\bαθανασια\b': 'αθανασιας',
        r'\bφωτεινη\b': 'φωτεινης',
        r'\bιωαννου\b': 'ιωαννη',
        r'\bαικατερινη\b': 'αικατερινης',
        r'\bγεωργιος\b': 'γεωργιου',
        r'\bπαναγιωτου\b': 'παναγιωτη',
        r'\bξενοφωντος\b': 'ξενοφωντα',
        r'\bβυρωνος\b': 'βυρωνα',
        r'\bπαντελεημονος\b': 'παντελεημονα',
        r'\bθεανους\b': 'θεανως',
        r'\bσπυριδωνος\b|\bσπυριδων\b': 'σπυριδωνα',
        r'\bσοφια\b': 'σοφιας',
        r'\bπαρασκευη\b': 'παρασκευης',
        r'\bοδυσσεας\b': 'οδυσσεα',
        r'\bπαναγιωτης\b': 'παναγιωτη',
        r'\bολγα\b': 'ολγας',
        r'\bμηχαηλ\b': 'μιχαηλ',
        r'\bλουκια-ταρσιτσα\b': 'λουκιας-ταρσιτσας',
        r'\bμαρια\b': 'μαριας',
        r'\bκωνσταντινα\b': 'κωνσταντινας',
        r'\bαννα\b': 'αννας',
        r'\bαλκηστις\b': 'αλκηστιδος',
        r'\bαδωνι\b': 'αδωνιδος',
        r'\bτρυφωνα\b': 'τρυφωνος',
        r'\bπροκοπη\b': 'προκοπιου',
        r'\bευρυπιδης\b': 'ευριπιδης',
        r'\bαντωνη\b': 'αντωνιου',
        r'\bανασταση\b': 'αναστασιου',
    }

    # With regex=True the keys are patterns (substring search), which is
    # why every key carries explicit \b word boundaries.
    rewritten = df['member_first_name'].replace(to_replace=first_name_patterns, regex=True)
    df['member_first_name'] = rewritten

    return df
def name_formatting_dataframe(df):
    """Clean the raw 'member_name' column (names in the genitive case).

    Steps, in order: normalise dashes, detach and reorder parentheses,
    drop father's names and initials, then apply hand-picked spelling and
    case corrections. Every pattern-based `str.replace` passes
    `regex=True` explicitly, because the pandas default for that argument
    differs between versions and a literal match would silently disable
    all of the patterns below.

    The column is replaced on `df` itself and `df` is returned.
    """
    names = df['member_name']

    # U+0096 is a stray control character used as a dash in the raw data
    names = names.str.replace(r'\x96', '-', regex=True)
    names = names.str.replace(r'\s+\-\s+', '-', regex=True)

    # Add whitespace before parenthesis. (\S) matches any non-whitespace character.
    names = names.str.replace(r"(\S)\(", r'\1 (', regex=True)

    # remove father's name in parenthesis, e.g. "... (του γεωργιου)"
    regex_father_name = re.compile(r'(^.*)(\s\((του)\s.*?\)$)')
    names = names.str.replace(regex_father_name, r'\1', regex=True)

    # rearrange order of string parts with capture groups:
    # move the parenthesised part to the end of the name
    regex_parenthesis = re.compile(r'(^.*)(\s\(.*?\))(.*$)')
    names = names.str.replace(regex_parenthesis, r'\1\3\2', regex=True)

    # drop middle initials and similar fragments
    names = names.str.replace(
        r'( γ\. )|( α\. )|( οθ\. )|( συζ\.δημητριου )|( σπ\. )|( σπ\.)', ' ', regex=True)
    names = names.str.replace(r'\(πρωθυπουργου \)', '', regex=True)
    names = names.str.replace(
        r'αποστολ\.|αποστ\.|αποστολουυ', 'αποστολου', regex=True)

    # Convert substrings to genitive case and make corrections
    d1 = {r'\(βασω\)':'(βασως)', r'\(κελλυ\)':'(κελλυς)', r'\bκαμμενο\b':'καμμενου', r'\bγαλελιανου\b':'γαλενιανου',
          r'\bκαχριμακη\b':'καρχιμακη', r'\bξενογιαννακο\-πουλου\b':'ξενογιαννακοπουλου',
          r'\bβασιλικη παπανδρεου\b':'βασιλικης παπανδρεου', r'\bσπηλιοτοπουλου\b':'σπηλιωτοπουλου',
          r'\bκαλατζακου\b':'καλαντζακου', r'\bγενηματα\b':'γεννηματα',
          r'\bψαρουδας\b':'ψαρουδα'
          }

    # keys are regex patterns, values are plain strings (no escaping needed)
    names = names.replace(d1, regex=True)

    # Convert full strings to genitive case
    d2 = {'τζαννη τζανετακη': 'τζαννη τζαννετακη', 'φανη παλλη-πετραλια': 'φανης παλλη-πετραλια',
          'αναστασιο λιασκο':'αναστασιου λιασκου','αμαλια-μαρια μερκουρη (μελινα)':'αμαλιας-μαριας μερκουρη (μελινας)',
          'αγγελικη γκερεκου':'αγγελικης γκερεκου', 'αγγελικη-ευφροσυνη κιαου':'αγγελικης-ευφροσυνης κιαου-δημακου',
          'αλεξανδρος χαριτσης': 'αλεξανδρου χαριτση', 'γεωργιου σουλφια':'γεωργιου σουφλια',
          'ιωαννη ανδριανο':'ιωαννη ανδριανου', 'νικολαου λιναρδατου':'νικολαου-λεανδρου λιναρδατου',
          'καλλιοπης μπουρδαρα':'καλλιοπης μπουρδαρα (κελλυς)', 'ευαγγελου βανιζελου':'ευαγγελου βενιζελου',
          'βασιλικης παπανδρεου': 'βασιλικης παπανδρεου (βασως)', 'αντωνιος ρουπακιωτης':'αντωνιου ρουπακιωτη',
          'ιωαννη λιαππη':'ιωαννη λιαπη', 'ιωαννη κεφαλλογιαννη':'ιωαννη κεφαλογιαννη',
          'αναστασιου λιασκος': 'αναστασιου λιασκου', 'φωτη κουβελη':'φωτιου-φανουριου κουβελη'
          }

    # regex=False: a value is replaced only when the whole cell equals the key
    names = names.replace(d2, regex=False)

    names = names.str.replace('κων/νου', 'κωνσταντινου', regex=True)
    names = names.str.replace('ανασταστιου', 'αναστασιου', regex=True)
    names = names.str.replace(
        r'νικολ\.φωτιου χατζημιχαλη', 'νικολαου-φωτιου χατζημιχαλη', regex=True)
    names = names.str.replace(
        r'(μαρια-κολλια τσαρουχα)|(μαριας κολλια τσαρουχα)',
        'μαριας κολλια-τσαρουχα', regex=True)
    names = names.str.replace(
        r'(αποστολου-αθαν\. τσοχατζοπουλου)|(αποστολου-αθανας\. τσοχατζοπουλου)|'
        r'(αποστολουυ-αθαν\.τσοχατζοπουλου)|(αποστολουυ-αθανασιου τσοχατζοπουλου)|'
        r'(αποστολου-αθαν\.τσοχατζοπουλου)',
        'αποστολου-αθανασιου τσοχατζοπουλου', regex=True)

    df['member_name'] = names.str.strip()

    return df
def format_member_role(role):
    """Normalise a raw government-role description.

    Converts genitive role words to the nominative, removes commas,
    expands abbreviations and repairs known typos and hyphenation
    artefacts of the source data.

    Parameters:
        role: the raw role text (lower-case Greek without accents).

    Returns:
        The cleaned role string, single-spaced and stripped.
    """
    convert_roles = {'πρωθυπουργου': 'πρωθυπουργος', 'υπουργου' :'υπουργος',
                     'αναπληρωτη': 'αναπληρωτης', 'αναπληρωτριας': 'αναπληρωτης',
                     'αναπληρωτου': 'αναπληρωτης', 'υπουργουυγειας,': 'υπουργος υγειας ',
                     'υφυπουργου': 'υφυπουργος', 'υφυπουγου': 'υφυπουργος',
                     'αντιπροεδρου': 'αντιπροεδρος'}

    # word-by-word conversion of the role titles
    parts = [convert_roles.get(part, part) for part in role.split()]
    new_role = ' '.join(parts)

    # Commas act as separators only. The previous implementation used
    # re.sub(r',[^\s]', ', ', ...) which also swallowed the character
    # following the comma ("υγειας,προνοιας" lost its "π"). Turning every
    # comma into a space and collapsing whitespace keeps all letters.
    new_role = new_role.replace(',', ' ')
    new_role = re.sub(r'\s\s+', ' ', new_role)

    new_role = new_role.replace('πε.χω.δε', 'περιβαλλοντος χωροταξιας και δημοσιων εργων')
    new_role = new_role.replace('υπουργος βιομηχανιας ενεργειας και τεχνολογιας και εμποριου',
                                'υπουργος βιομηχανιας ενεργειας και τεχνολογιας και υπουργος εμποριου')
    new_role = new_role.replace('\x91', '')
    new_role = new_role.replace('μ.μ.ε', 'μεσων μαζικης ενημερωσης')
    new_role = new_role.replace('διιοικησης', 'διοικησης')
    new_role = new_role.replace('δημοσια ταξης', 'δημοσιας ταξης')
    new_role = new_role.replace('εσωτερικων εσωτερικων', 'εσωτερικων')
    new_role = new_role.replace('υπουργος αναπληρωτης', 'αναπληρωτης υπουργος')
    new_role = re.sub(r'δημ.διοικησης|ημ.διοικησης|δημ. διοικησης', 'δημοσιας διοικησης', new_role)
    # kept for inputs that already arrive with the leading letter missing
    new_role = new_role.replace(' ημοσιας', ' δημοσιας')
    new_role = new_role.replace('εςωτερικων', 'εσωτερικων')
    new_role = new_role.replace('δημο-σιων', 'δημοσιων')
    new_role = new_role.replace('&', 'και')
    new_role = new_role.replace(' π εριβαλλοντος', ' περιβαλλοντος')
    new_role = new_role.replace('παιδειας και θρησκευματων πολιτισμου και αθλητισμου',
                                'παιδειας θρησκευματων πολιτισμου και αθλητισμου')
    new_role = new_role.replace('πολιτιςμου', 'πολιτισμου')
    new_role = new_role.replace('μακεδονιας και θρακης', 'μακεδονιας-θρακης')
    new_role = new_role.replace('χαρτο-φυλακιο', 'χαρτοφυλακιο')
    new_role = new_role.replace('περιβαλλοντοςκαι', 'περιβαλλοντος και')
    new_role = new_role.replace('ενη-μερωσης', 'ενημερωσης')
    new_role = new_role.replace('μακεδονιας θρακης', 'μακεδονιας-θρακης')
    new_role = new_role.replace('ανθρωπινωνδικαιωματων', 'ανθρωπινων δικαιωματων')
    new_role = new_role.replace('αναπληρωτης υπουργος ναυτιλιας και νησιωτικης πολιτικης αγροτικης αναπτυξης και τροφιμων',
                                'αναπληρωτης υπουργος ναυτιλιας και νησιωτικης πολιτικης')
    new_role = new_role.replace('τεχνολο-γιας', 'τεχνολογιας')

    new_role = new_role.replace('γεωγιας', 'γεωργιας')
    new_role = re.sub(r'\s\s+', ' ', new_role)
    # drop a trailing comma or full stop
    new_role = re.sub(r'(,|\.)$', '', new_role)
    new_role = new_role.strip()

    return new_role
def correct_specific_roles(df, name, role_before, role_after, date, gov_date_from):
    """Overwrite the role of one specific event row.

    The rows to change are those where 'cleaned_fullname', 'member_role',
    'date' and 'gov_date_from' equal `name`, `role_before`, `date` and
    `gov_date_from` respectively. Their 'member_role' becomes
    `role_after`. `df` is modified in place and returned.
    """
    is_person = df.cleaned_fullname == name
    has_old_role = df.member_role == role_before
    on_date = df.date == date
    in_government = df.gov_date_from == gov_date_from

    rows_to_fix = is_person & has_old_role & on_date & in_government
    df.loc[rows_to_fix, 'member_role'] = role_after

    return df
def correct_specific_events(df, name, role, event_before, event_after, date,
                            gov_date_from):
    """Overwrite the event type of one specific event row.

    Rows are selected where 'cleaned_fullname', 'event', 'date',
    'member_role' and 'gov_date_from' equal `name`, `event_before`,
    `date`, `role` and `gov_date_from`. Their 'event' becomes
    `event_after`. `df` is modified in place and returned.
    """
    mask = (df.cleaned_fullname == name) \
           & (df.event == event_before) \
           & (df.date == date) \
           & (df.member_role == role) \
           & (df.gov_date_from == gov_date_from)
    df.loc[mask, 'event'] = event_after

    return df


def text_formatting(text):
    """Return `text` lower-cased, unaccented and whitespace-normalised.

    Quote/accent marks are removed, tabs and repeated whitespace collapse
    to one space, spaces around dashes are dropped, accented Greek vowels
    lose their accent and a set of Latin letters is mapped to Greek ones.
    """
    text = re.sub("['’`΄‘́̈]", '', text)
    text = re.sub(r'\t+', ' ', text)
    text = text.strip()
    text = re.sub(r'\s\s+', ' ', text)
    text = re.sub(r'\s*(-|–)\s*', '-', text)  # fix dashes
    text = text.lower()
    # remove accents (the Latin "i" is mapped to the Greek iota as well)
    text = text.translate(str.maketrans('άέόώήίϊΐiύϋΰ', 'αεοωηιιιιυυυ'))
    # convert english characters to greek
    text = text.translate(str.maketrans('akebyolruxtvhmnz', 'ακεβυολρυχτνημνζ'))

    return text


def json_file_to_chartrie(json_file):
    """Load a JSON object of name declensions into a `pygtrie.CharTrie`.

    The whole file text is passed through `text_formatting` before it is
    parsed, so keys and values end up lower-cased and unaccented.

    Parameters:
        json_file: path of a UTF-8 JSON file whose top level is an object.

    Returns:
        A CharTrie holding one entry per top-level key.
    """
    # read-only access is enough; 'r+' needlessly required write permission
    with open(json_file, 'r', encoding='utf-8') as f:
        parsed = f.read()

    formatted_text = text_formatting(parsed)
    json_dict = json.loads(formatted_text)

    trie = pygtrie.CharTrie()

    for key, value in json_dict.items():
        trie[key] = value

    return trie


def find_nominative_and_gender(search_term, gender, tries, surname_search = False):
    """Find the nominative form (and gender) of a name given in the genitive.

    Parameters:
        search_term: name in the genitive; hyphenated names are looked up
            part by part. A null value is returned untouched.
        gender: gender known so far ('male', 'female' or null).
        tries: dict mapping a gender label to a trie. Each trie must offer
            `has_subtrie(prefix)` and `iteritems(prefix=...)`, yielding
            `(nominative, info)` where `info['ενικος']['γενικη']` is the
            genitive, either a string or a list of strings.
        surname_search: True when `search_term` is a surname.

    Returns:
        `(name, gender)`. Parts that are not found stay as they are,
        except that a male surname not ending in "ου", "ς", "λ", "τ" or
        "κ" gets a final "ς". When the parts disagree on gender a message
        is printed and the joined name is returned with the incoming
        `gender`. (The previous code returned only the last part in that
        case, and merged identical parts of a hyphenated name.)
    """
    if pd.isnull(search_term):
        return search_term, gender

    # a female surname is the same in every grammatical case
    if surname_search and not (gender == 'male' or pd.isnull(gender)):
        return search_term, gender

    matched_parts = []
    for part in search_term.split('-'):

        # return the same if nothing is found
        matched_name, matched_gender = part, gender
        # candidates are fetched by the first half of the word, because
        # the ending differs between the genitive and the nominative
        first_half = part[0:(math.ceil(len(part) / 2))]
        found_in_trie = False

        for trie_gender, trie in tries.items():

            if not trie.has_subtrie(first_half):
                continue

            for name, info in trie.iteritems(prefix=first_half):

                # skip candidates that are much longer than the search
                # term, plus one hand-picked wrong entry
                if len(name) > ((2 * len(first_half)) + 2) or name == 'νικολαους':
                    continue

                genitive = info['ενικος']['γενικη']
                if (isinstance(genitive, str) and part == genitive) or \
                        (isinstance(genitive, list) and part in genitive):
                    matched_name, matched_gender = name, trie_gender
                    found_in_trie = True
                    break

        # male surname that is unknown to the trie: build the nominative
        if surname_search and not found_in_trie:
            if part and not part.endswith(('ου', 'ς', 'λ', 'τ', 'κ')):
                matched_name = part + 'ς'

        matched_parts.append((matched_name, matched_gender))

    joined_name = '-'.join(name for name, _ in matched_parts)
    genders = {g for _, g in matched_parts if not pd.isnull(g)}

    if len(genders) > 1:
        print('Error with genders of name parts of ', search_term)
        print(dict(matched_parts))
        return joined_name, gender

    matched_gender = genders.pop() if genders else np.nan

    return joined_name, matched_gender
def assert_balanced_events_for_each_role(df):
    """Tell whether every role has as many start events as end events.

    Rows are grouped by ('cleaned_fullname', 'gov_date_from',
    'member_role'). Returns True only when, in every group, the number of
    start events equals the number of end events.
    """
    start_events = ['διορισμος', 'αρχη_κυβερνησης']
    end_events = ['παραιτηση', 'παυση', 'απεβιωσε', 'last_script_run', 'τελος_κυβερνησης']

    def group_is_balanced(role_rows):
        # compare how many rows open the role with how many close it
        opened = role_rows.loc[role_rows.event.isin(start_events)].shape[0]
        closed = role_rows.loc[role_rows.event.isin(end_events)].shape[0]
        return opened == closed

    grouped = df.groupby(['cleaned_fullname', 'gov_date_from', 'member_role'])

    return all(group_is_balanced(role_rows) for _, role_rows in grouped)
# PREPARE DATAFRAME
df = pd.read_csv('../out_files/original_gov_members_data.csv', encoding='utf-8')
df = name_formatting_dataframe(df)

# "first last (nickname)" -> three columns; missing parts become NaN
df[['member_first_name','member_last_name', 'nickname']] = df['member_name'].\
    str.split(" ", expand=True).fillna(value=np.nan)

df = first_name_formatting(df)
# regex=True is explicit because the pandas default differs between versions
df['nickname'] = df['nickname'].str.replace(r'[\(\)]', '', regex=True)
df['gender'] = np.nan

# CREATE TRIES FOR NAME SEARCH
print('Creating tries for name search...')
# Female surnames remain the same in all grammatical cases, so no trie is created for them
male_name_trie = json_file_to_chartrie('../out_files/wiki_data/male_name_cases_populated.json')
female_name_trie = json_file_to_chartrie('../out_files/wiki_data/female_name_cases_populated.json')
male_surname_trie = json_file_to_chartrie('../out_files/wiki_data/male_surname_cases_populated.json')

name_tries = {'male':male_name_trie, 'female':female_name_trie}
surname_tries = {'male': male_surname_trie}

# FIND NOMINATIVE CASE AND GENDER
print('Searching for nominative cases and gender of member names...')

# Create df because only on df (not series/column) you can apply a custom function that returns two variables
first_name_gender_df = df['member_first_name'].to_frame().join(df['gender'])
df[['member_first_name', 'gender']] = first_name_gender_df.apply(lambda x: find_nominative_and_gender(
    x['member_first_name'], x['gender'], name_tries, surname_search=False), axis=1, result_type="expand")

nickname_gender_df = df['nickname'].to_frame().join(df['gender'])
df[['nickname', 'gender']] = nickname_gender_df.apply(lambda x: find_nominative_and_gender(
    x['nickname'], x['gender'], name_tries, surname_search=False), axis=1, result_type="expand")

df["nickname"] = df["nickname"].apply(lambda x: '('+x+')' if not pd.isnull(x) else '')

# the surname lookup runs last so that it can use the gender found above
last_name_gender_df = df['member_last_name'].to_frame().join(df['gender'])
df[['member_last_name', 'gender']] = last_name_gender_df.apply(lambda x: find_nominative_and_gender(
    x['member_last_name'], x['gender'], surname_tries, surname_search=True), axis=1, result_type="expand")

# JOIN
name_cols = ['member_first_name', 'member_last_name', 'nickname']
df['cleaned_fullname'] = df[name_cols].agg(' '.join, axis=1).str.strip()
df = specific_corrections(df)

df['member_role'] = df['member_role'].apply(format_member_role)

print('Making specific corrections in data...')

# SPECIFIC CORRECTIONS
#----------------
# role corrections before "exploding" roles, due to mistaken data
df = correct_specific_roles(df, 'τζαννης τζαννετακης', 'υπουργος εξωτερικων και τουρισμου',
                            'υπουργος τουρισμου', '1989-07-03', '1989-07-02')
df = correct_specific_roles(df, 'γεωργιος γεννηματας', 'υπουργος εθνικης οικονομιας και οικονομικων',
                            'υπουργος εθνικης οικονομιας και υπουργος οικονομικων', '1993-10-13', '1993-10-13')
df = correct_specific_roles(df, 'γεωργιος κοντογεωργης', 'υπουργος εθνικης οικονομιας και τουρισμου',
                            'υπουργος εθνικης οικονομιας και υπουργος τουρισμου','1989-10-12', '1989-10-12')
df = correct_specific_roles(df, 'στεφανος μανος', 'υπουργος εθνικης οικονομιας και οικονομικων',
                            'υπουργος εθνικης οικονομιας και υπουργος οικονομικων','1993-10-13', '1990-04-11')
df = correct_specific_roles(df, 'γιαννος παπαντωνιου', 'υπουργος εθνικης οικονομιας και οικονομικων',
                            'υπουργος εθνικης οικονομιας και υπουργος οικονομικων','2000-04-13', '1996-09-25')
#----------------

# Split combined roles into a list of separate roles. The look-ahead
# assertion splits on the whitespace before "και <role title>" without
# consuming the title itself.
df['member_role'] = df['member_role'].\
    str.split(r'\s(?=και υφυπουργ|και υπουργ|και αναπληρωτ|και πρωθυπουργ|και αντιπροεδρ)')
df = explode(df, 'member_role')
df['member_role'] = df['member_role'].str.replace(r'^και\s', '', regex=True)

#----------------

# Missing rows. The values are positional and must follow df.columns.
# NOTE(review): the layout below assumes the source CSV columns are
# [date, event, member_name, member_role, gov_date_from, gov_date_to,
# gov_name]; the remaining five are the columns added above, in the
# order [member_first_name, member_last_name, nickname, gender,
# cleaned_fullname] - confirm against the CSV header.
new_rows = [
    ['2012-03-27', 'παραιτηση', 'φωτεινης γεννηματα', 'αναπληρωτης υπουργος εσωτερικων',
     '2011-11-11', '2012-05-17', 'παπαδημου λουκα δ.',
     'φωτεινη', 'γεννηματα', np.nan, 'female', 'φωτεινη γεννηματα'],
    # member_name and gov_name were swapped in this row; neither column
    # reaches the final output
    ['1996-09-25', 'παραιτηση', 'κωνσταντινου σημιτη', 'πρωθυπουργος',
     '1996-01-22', '1996-09-25', 'σημιτη κωνσταντινου',
     'κωνσταντινος', 'σημιτης', np.nan, 'male', 'κωνσταντινος σημιτης'],
    ['1999-09-14', 'απεβιωσε', 'γιαννου κρανιδιωτη', 'αναπληρωτης υπουργος εξωτερικων',
     '1996-09-25', '2000-04-13', 'σημιτη κωνσταντινου',
     'γιαννος', 'κρανιδιωτης', np.nan,'male', 'γιαννος κρανιδιωτης'],
    ['2012-06-21', 'τελος_κυβερνησης', 'γεωργιου ζανια', 'υπουργος οικονομικων',
     '2012-05-17', '2012-06-21', 'πικραμμενου παναγιωτη οθ. (υπηρεσιακη)',
     'γεωργιος', 'ζανιας', np.nan, 'male', 'γεωργιος ζανιας'],
    # this row used to be listed twice; the copy was removed later by
    # drop_duplicates() anyway
    ['2012-06-21', 'αρχη_κυβερνησης', 'γεωργιου ζανια', 'υπουργος οικονομικων',
     '2012-06-21','2015-01-26','σαμαρα κ. αντωνιου',
     'γεωργιος', 'ζανιας', np.nan, 'male', 'γεωργιος ζανιας'],
]

rows_df = pd.DataFrame(new_rows, columns=df.columns)
df = pd.concat([df, rows_df], ignore_index=True)

#----------------

# Correct member roles based on a legal change by "Ν. 1943/1991" https://gslegal.gov.gr/?p=1304
# string column to timestamp
df['gov_date_from'] = pd.to_datetime(df['gov_date_from'])
df['date'] = pd.to_datetime(df['date'])

gov_date_from = datetime.datetime.strptime('1990-04-11', '%Y-%m-%d')
event_date = datetime.datetime.strptime('1991-04-11', '%Y-%m-%d')

ministers_without_portfolio_1990 = df.loc[
    (df.gov_date_from == gov_date_from) & (df.member_role == 'υπουργος χωρις χαρτοφυλακιο')].copy()

for name, subdf in ministers_without_portfolio_1990.groupby('cleaned_fullname'):
    if subdf.shape[0] == 1 and subdf['event'].iloc[0] == 'διορισμος':
        # Two copies of the appointment row: the first closes the old
        # role the day before the change, the second opens the new role.
        new_subdf = pd.concat([subdf, subdf], ignore_index=True)
        # we can use .at because index is reset
        new_subdf.at[0, 'event'] = 'παραιτηση'
        new_subdf.at[0, 'date'] = event_date - datetime.timedelta(days=1)
        new_subdf.at[1, 'event'] = 'διορισμος'
        new_subdf.at[1, 'date'] = event_date
        new_subdf.at[1, 'member_role'] = 'υπουργος επικρατειας'
        df = pd.concat([df, new_subdf], ignore_index=True)
    else:
        print('Error in minsters without portfolio in 1990.')

#----------------

# event corrections due to mistaken data
df = correct_specific_events(df, 'γεωργιος κουμουτσακος', 'αναπληρωτης υπουργος μεταναστευσης και ασυλου',
                             'παραιτηση', 'διορισμος', '2020-01-15', '2019-07-08')
df = correct_specific_events(df, 'παναγιωτης μηταρακης', 'υπουργος μεταναστευσης και ασυλου',
                             'παραιτηση', 'διορισμος', '2020-01-15', '2019-07-08')
df = correct_specific_events(df, 'κωνσταντινος κουκοδημος', 'υφυπουργος παιδειας και θρησκευματων',
                             'διορισμος', 'παραιτηση', '2014-09-02', '2012-06-21')

#----------------

# corrections/changes due to renaming of ministries during a term of office
# keeping the first name of the ministry
df = correct_specific_roles(df, 'αναστασιος λιασκος', 'υφυπουργος τουριστικης αναπτυξης',
                            'υφυπουργος τουρισμου', '2006-02-15', '2004-03-10')

df = correct_specific_roles(df, 'ανδρεας ξανθος', 'αναπληρωτης υπουργος υγειας',
                            'αναπληρωτης υπουργος υγειας και κοινωνικων ασφαλισεων','2015-08-27', '2015-01-26')

df = correct_specific_roles(df, 'αποστολος φωτιαδης', 'υφυπουργος οικονομιας και οικονομικων',
                            'υφυπουργος οικονομικων', '2004-03-10', '2000-04-13')

df = correct_specific_roles(df, 'γεωργιος φλωριδης', 'υφυπουργος οικονομιας και οικονομικων',
                            'υφυπουργος οικονομικων', '2003-07-07', '2000-04-13')

df = correct_specific_roles(df, 'δημητριος αβραμοπουλος', 'υπουργος τουριστικης αναπτυξης',
                            'υπουργος τουρισμου','2006-02-15', '2004-03-10')

df = correct_specific_roles(df, 'δημητριος στρατουλης', 'αναπληρωτης υπουργος υγειας',
                            'αναπληρωτης υπουργος υγειας και κοινωνικων ασφαλισεων','2015-03-21', '2015-01-26')

df = correct_specific_roles(df, 'δημητριος στρατουλης', 'αναπληρωτης υπουργος εργασιας κοινωνικης ασφαλισης',
                            'αναπληρωτης υπουργος εργασιας κοινωνικης ασφαλισης και κοινωνικης αλληλεγγυης',
                            '2015-03-21', '2015-01-26')

df = correct_specific_roles(df, 'θεανω φωτιου', 'αναπληρωτης υπουργος εργασιας κοινωνικης ασφαλισης και κοινωνικης αλληλεγγυης',
                            'αναπληρωτης υπουργος εργασιας και κοινωνικης αλληλεγγυης','2015-08-27', '2015-01-26')

df = correct_specific_roles(df, 'χρηστος παχτας', 'υφυπουργος οικονομιας και οικονομικων',
                            'υφυπουργος εθνικης οικονομιας','2004-01-26', '2000-04-13')

df = correct_specific_roles(df, 'φιλιππος σαχινιδης', 'υφυπουργος οικονομικων',
                            'υφυπουργος οικονομιας ανταγωνιστικοτητας και ναυτιλιας','2011-06-17', '2009-10-06')

df = correct_specific_roles(df, 'παναγιωτης σκουρλετης', 'υπουργος εργασιας κοινωνικης ασφαλισης και κοινωνικης αλληλεγγυης',
                            'υπουργος εργασιας και κοινωνικης αλληλεγγυης','2015-07-18', '2015-01-26')

df = correct_specific_roles(df, 'παναγιωτης κουρουμπλης', 'υπουργος υγειας',
                            'υπουργος υγειας και κοινωνικων ασφαλισεων','2015-08-27', '2015-01-26')

df = correct_specific_roles(df, 'κωνσταντινος μουσουρουλης', 'υπουργος ναυτιλιας και αιγαιου',
                            'υπουργος ναυτιλιας','2013-06-25', '2012-06-21')

df = correct_specific_roles(df, 'νικολαος χριστοδουλακης', 'υπουργος οικονομιας και οικονομικων',
                            'υπουργος εθνικης οικονομιας και οικονομικων','2004-03-10', '2000-04-13')

df = correct_specific_roles(df, 'ουρανια αντωνοπουλου',
                            'αναπληρωτης υπουργος εργασιας κοινωνικης ασφαλισης και κοινωνικης αλληλεγγυης',
                            'αναπληρωτης υπουργος εργασιας και κοινωνικης αλληλεγγυης','2015-08-27', '2015-01-26')

#----------------

assert_filled_gender(df)
df.drop_duplicates(inplace=True)
df['gov_date_to'] = pd.to_datetime(df['gov_date_to'])

# Correct last government's role ending dates: a role of the most recent
# government that only has an appointment gets a closing event dated at
# the government's gov_date_to.
last_gov_events = df.loc[(df.gov_date_from == df.gov_date_from.max())].copy()
for name, subdf in last_gov_events.groupby(['cleaned_fullname', 'member_role']):
    if subdf.shape[0] == 1 and subdf['event'].iloc[0] == 'διορισμος':
        closing_row = subdf.copy()
        # single-row frame, so whole-column assignment touches one cell
        closing_row['event'] = 'last_script_run'
        closing_row['date'] = closing_row['gov_date_to']
        df = pd.concat([df, closing_row], ignore_index=True)

#----------------

# group by cleaned_fullname, gov_date_from balanced events
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(by='date', ascending=True)
df.drop_duplicates(inplace=True)

balanced = assert_balanced_events_for_each_role(df)
if not balanced:
    print('Warning: start and end events are not balanced for one or more roles.')

final_list = []

start_events = ['διορισμος', 'αρχη_κυβερνησης']
end_events = ['παραιτηση', 'παυση', 'απεβιωσε', 'last_script_run',
              'τελος_κυβερνησης']


def _format_event_date(timestamp):
    # change format of date e.g. from 2001-01-27 to 27/01/2001
    return np.nan if pd.isnull(timestamp) else timestamp.strftime('%d/%m/%Y')


# Match assignment and resignation dates for each role. Dates stay real
# timestamps until they are written out: sorting 'dd/mm/YYYY' strings
# would order them alphabetically instead of chronologically.
for name, subdf in df.groupby(['cleaned_fullname','gov_date_from']):

    subdf = subdf.sort_values(by='date', ascending=True, kind='mergesort')

    # unique() keeps first-appearance order, so the output row order is
    # the same on every run (iterating a set of strings is not)
    for role in subdf.member_role.unique():

        role_subdf = subdf.loc[(subdf.member_role==role)]
        assignment_date = role_subdf.loc[(role_subdf.event.isin(start_events)),'date'].iloc[0]
        resignation_date = role_subdf.loc[(role_subdf.event.isin(end_events)),'date'].iloc[0]

        final_list.append([subdf.iloc[0].cleaned_fullname, role,
                           _format_event_date(assignment_date),
                           _format_event_date(resignation_date),
                           subdf.iloc[0].gender])

final_df = pd.DataFrame(final_list, columns = ['member_name', 'role', 'role_start_date', 'role_end_date', 'gender'])

final_df.to_csv('../out_files/formatted_roles_gov_members_data.csv', encoding='utf-8', index=False)
print('\nCreated file formatted_roles_gov_members_data.csv with columns ', str(final_df.columns))