├── .gitignore ├── README.md ├── data ├── WV6_Codebook_v_2014_11_07.xls ├── codebook.csv ├── codebook_raw.csv └── headers.csv ├── db.py ├── import.py ├── import.sh ├── output └── .placeholder ├── process.sh ├── questions.md ├── requirements.txt ├── summarize_agreement.py └── summarize_questions.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | data/codebook_raw.csv 3 | data/codebook.csv 4 | data/WV6_Data_ascii_v_2015_04_18.dat 5 | data/WV6_Data_ascii_v_2015_04_18-clean.csv 6 | output/* 7 | *.pyc 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # World Values Data parser 2 | 3 | A data processing rig for the [World Values Survey](http://www.worldvaluessurvey.org/wvs.jsp). Currently tested only with WVS Wave 6 (2010-2014). 4 | 5 | Have a Mac and need help getting the requirements installed? [Read our guide](http://blog.apps.npr.org/2013/06/06/how-to-setup-a-developers-environment.html)! 6 | 7 | # Installation 8 | 9 | ## System requirements 10 | 11 | * bash 12 | * Python + pip 13 | * PostgreSQL 14 | 15 | ## Python requirements 16 | 17 | 18 | Install project requirements (preferably in a virtualenv): 19 | 20 | ``` 21 | pip install -r requirements.txt 22 | ``` 23 | 24 | ## Download the data 25 | 26 | The data cannot be redistributed, so you'll have to go [download it](http://www.worldvaluessurvey.org/WVSDocumentationWV6.jsp). Follow the *WV6_Data_ascii_delimited_v_2015_04_18 (delimited with comma)* link, unzip the file, and copy `WV6_Data_ascii_v_2015_04_18.dat` into this projects `data` folder. 27 | 28 | # Processing all the data 29 | 30 | To import the data and summarize it the way we do, run: 31 | 32 | ``` 33 | ./process.sh 34 | ``` 35 | 36 | This will generate summary output for all the questions NPR analyzed in our reporting. 
The output will go in the `output` directory in CSV format. 37 | 38 | # Import the data and process yourself 39 | 40 | First, run the importer: 41 | 42 | ``` 43 | ./import.sh 44 | ``` 45 | 46 | Now, you can create summaries for individual questions by calling: 47 | 48 | ``` 49 | ./summarize_questions.py 50 | ``` 51 | 52 | So if you were interested in the responses per-country to question v52 (*"A university education is more important for a boy than for a girl."*), you would run: 53 | 54 | ``` 55 | ./summarize_questions.py v52 56 | ``` 57 | 58 | The summary output will be in `output/v52.csv`. 59 | 60 | You can pass multiple questions, just separate them with space: 61 | 62 | ``` 63 | ./summarize_questions.py v52 v54 v60 64 | ``` 65 | -------------------------------------------------------------------------------- /data/WV6_Codebook_v_2014_11_07.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nprapps/worldvalues/346f83d1199be8c1f4a1522e319673c6b68deeae/data/WV6_Codebook_v_2014_11_07.xls -------------------------------------------------------------------------------- /data/headers.csv: -------------------------------------------------------------------------------- 1 | 
"v1","v2","v2a","v3","v4","v5","v6","v7","v8","v9","v10","v11","v12","v13","v14","v15","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31","v32","v33","v34","v35","v36","v37","v38","v39","v40","v41","v42","v43","v44","v44_es","v45","v46","v47","v48","v49","v50","v51","v52","v53","v54","v55","v56","v56_nz","v57","v58","v59","v60","v61","v62","v63","v64","v65","v66","v67","v68","v69","v70","v71","v72","v73","v74","v74b","v75","v76","v77","v78","v79","v80","v81","v82","v83","v84","v85","v86","v87","v88","v89","v90","v91","v92","v93","v94","v95","v96","v97","v98","v99","v100","v101","v102","v103","v104","v105","v106","v107","v108","v109","v110","v111","v112","v113","v114","v115","v116","v117","v118","v119","v120","v121","v122","v123","v124","v125_00","v125_01","v125_02","v125_03","v125_04","v125_05","v125_06","v125_07","v125_08","v125_09","v125_10","v125_11","v125_12","v125_13","v125_14","v125_15","v125_16","v125_17","v126","v127","v128","v129","v130","v131","v132","v133","v134","v135","v136","v137","v138","v139","v140","v141","v142","v143","v144","v145","v146","v147","v148","v149","v150","v151","v152","v153","v154","v155","v156","v157","v158","v159","v160","v161","v162","v163","v164","v165","v166","v167","v168","v169","v160a","v160b","v160c","v160d","v160e","v160f","v160g","v160h","v160i","v160j","v170","v171","v172","v173","v174","v175","v176","v177","v178","v179","v180","v181","v182","v183","v184","v185","v186","v187","v188","v189","v190","v191","v192","v193","v194","v195","v196","v197","v198","v199","v200","v201","v202","v203","v203a","v204","v205","v206","v207","v207a","v208","v209","v210","v211","v212","v213","v214","v215_01","v215_02","v215_03","v215_04","v215_05","v215_06","v215_07","v215_08","v215_10","v215_11","v215_12","v215_13","v215_14","v215_15","v215_16","v215_17","v215_18","v216","v217","v218","v219","v220","v221","v222","v223","v224","v218_esma","v217_esma","v219_esma","v220_esma","v221_esma","v222_esma","v223
_esma","v224_esma","v225","v226","v227","v228","v228_2","v228a","v228b","v228c","v228d","v228e","v228f","v228g","v228h","v228i","v228j","v228k","v229","v230","v231","v232","v233","v234","v235","v236","v237","v238","v239","v240","v241","v242","v243","v243_au","v244","v244_au","v245","v246","v247","v248","v248_cs","v249","v250","v251","v252","v253","v253_cs","v254","v255","v256","v256b","v256c","v256_map","v257","v258","v258a","v260","v261","v262","v263","v264","v265","y001","y002","y003","mn_35a","mn_163a","mn_163b","mn_163c","mn_228l","mn_228m","mn_228n","mn_228o","mn_228p","mn_228q","mn_228r","mn_228s1","mn_228s2","mn_228s3","mn_228s4","mn_228s5","mn_228s6","mn_228s7","mn_228s8","mn_229a","mn_229b","mn_230a","mn_233a","mn_233b","mn_234a","mn_237a","mn_237b1","mn_237b2","mn_237b3","mn_237b4","mn_237b5","mn_237b6","mn_237b7","mn_237c1","mn_237c2","mn_237c3","mn_237c4","mn_237c5","mn_237c6","mn_249a1","mn_249a2","mn_249a3","sacsecval","secvalwgt","resemaval","weightb","i_authority","i_nationalism","i_devout","defiance","weight1a","i_religimp","i_religbel","i_religprac","disbelief","weight2a","i_norm1","i_norm2","i_norm3","relativism","weight3a","i_trustarmy","i_trustpolice","i_trustcourts","scepticism","weight4a","i_indep","i_imagin","i_nonobed","autonomy","weight1b","i_womjob","i_wompol","i_womedu","equality","weight2b","i_homolib","i_abortlib","i_divorlib","choice","weight3b","i_voice1","i_voice2","i_voi2_00","voice","weight4b" 2 | -------------------------------------------------------------------------------- /db.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import dataset 4 | from collections import OrderedDict 5 | 6 | POSTGRES_URL = 'postgresql:///worldvalues' 7 | db = dataset.connect(POSTGRES_URL) 8 | 9 | 10 | def initialize_counts(question_id): 11 | """ 12 | Initialize counts for summary 13 | """ 14 | table = db['categories'] 15 | categories = table.find(question_id=question_id) 16 | 
category_counts = OrderedDict() 17 | 18 | for category in categories: 19 | category_counts[category['value']] = 0 20 | 21 | return category_counts 22 | 23 | 24 | def query(question_id): 25 | """ 26 | Query for raw results from a question ID 27 | """ 28 | table = db['codebook'] 29 | question = table.find_one(question_id=question_id) 30 | 31 | result = db.query(""" 32 | select 33 | countries.value as country, c.value as response 34 | from 35 | survey_responses r 36 | join 37 | (select * from categories where question_id='{0}') c on r.{0}=c.code 38 | join 39 | (select * from categories where question_id='v2a') countries on r.v2a=countries.code 40 | order by 41 | country 42 | ; 43 | """.format(question_id)) 44 | 45 | return question, list(result) 46 | 47 | 48 | def get_country_list(): 49 | """ 50 | Return list of countries 51 | """ 52 | countries = [] 53 | table = db['categories'] 54 | country_result = list(table.find(question_id='v2a', order_by='value')) 55 | for row in country_result: 56 | countries.append(row['value']) 57 | 58 | return countries 59 | -------------------------------------------------------------------------------- /import.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import csv 5 | import dataset 6 | from collections import OrderedDict 7 | 8 | POSTGRES_URL = 'postgresql:///worldvalues' 9 | db = dataset.connect(POSTGRES_URL) 10 | 11 | QUESTION_TYPES = ( 12 | ('mentioned', """1##Mentioned 13 | 2##Not mentioned"""), 14 | ('agree_3way', """1##Agree 15 | 2##Neither 16 | 3##Disagree"""), 17 | ('agree_4way', """1##Agree strongly 18 | 2##Agree 19 | 3##Disagree 20 | 4##Strongly disagree"""), 21 | ('likert', """2##2 22 | 3##3 23 | 4##4 24 | 5##5 25 | 6##6 26 | 7##7 27 | 8##8 28 | 9##9"""), 29 | ) 30 | 31 | 32 | def clean_data(): 33 | """ 34 | Clean data and create schema 35 | """ 36 | 37 | schema_created = False 38 | 39 | cleaned_file = 
open('data/WV6_Data_ascii_v_2015_04_18-clean.csv', 'w') 40 | writer = csv.writer(cleaned_file, quoting=csv.QUOTE_ALL) 41 | 42 | input_file = open('data/WV6_Data_ascii_v_2015_04_18.dat') 43 | reader = csv.reader(input_file) 44 | 45 | raw_headers = reader.next() 46 | 47 | # Process rows 48 | id = 1 49 | for row in reader: 50 | # Add row to cleaned CSV 51 | row.insert(0, id) 52 | writer.writerow(row) 53 | id += 1 54 | 55 | # Create database schema from first row 56 | if not schema_created: 57 | create_schema(row, raw_headers) 58 | schema_created = True 59 | 60 | 61 | def create_schema(row, raw_headers): 62 | """ 63 | Create schema from a single row of data 64 | """ 65 | table = db['survey_responses'] 66 | 67 | # Get pre-cached headers 68 | with open('data/headers.csv') as f: 69 | reader = csv.reader(f) 70 | headers = reader.next() 71 | 72 | # Clean row 73 | processed_row = [] 74 | for column in row[1:]: 75 | processed_row.append(str(column)) 76 | 77 | # Insert row into tabel 78 | processed_dict = OrderedDict(zip(headers, processed_row)) 79 | table.insert(processed_dict) 80 | 81 | # Clear out table 82 | db.query('delete from survey_responses') 83 | 84 | 85 | def load_codebook(): 86 | codebook_table = db['codebook'] 87 | category_table = db['categories'] 88 | with open('data/codebook.csv') as f: 89 | rows = list(csv.DictReader(f)) 90 | 91 | for row in rows: 92 | question_type = None 93 | for potential_question_type, categories in QUESTION_TYPES: 94 | if categories in row['CATEGORIES']: 95 | question_type = potential_question_type 96 | break 97 | 98 | question = OrderedDict(( 99 | ('question_id', row['VAR'].lower()), 100 | ('question', row['QUESTION']), 101 | ('label', row['LABEL']), 102 | ('question_type', question_type), 103 | )) 104 | db_id = codebook_table.insert(question) 105 | 106 | categories = row['CATEGORIES'].splitlines() 107 | for category in categories: 108 | try: 109 | code, middle_value, real_value = category.split('#') 110 | except ValueError: 111 | print 
'skipped {0} due to country specific code'.format(row['VAR']) 112 | category_row = OrderedDict(( 113 | ('db_id', db_id), 114 | ('question_id', row['VAR'].lower()), 115 | ('code', str(code)), 116 | ('value', str(real_value)), 117 | )) 118 | category_table.insert(category_row) 119 | 120 | 121 | if __name__ == '__main__': 122 | load_codebook() 123 | clean_data() 124 | -------------------------------------------------------------------------------- /import.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Create database" 4 | dropdb --if-exists worldvalues 5 | createdb worldvalues 6 | 7 | echo "Convert codebook from xls to csv format" 8 | in2csv data/WV6_Codebook_v_2014_11_07.xls > data/codebook_raw.csv 9 | tail -n +4 data/codebook_raw.csv > data/codebook.csv 10 | 11 | echo "Set up tables and import codebook" 12 | ./import.py 13 | 14 | echo "Import all the data" 15 | psql worldvalues -c "COPY survey_responses FROM '`pwd`/data/WV6_Data_ascii_v_2015_04_18-clean.csv' DELIMITER ',' CSV;" 16 | -------------------------------------------------------------------------------- /output/.placeholder: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nprapps/worldvalues/346f83d1199be8c1f4a1522e319673c6b68deeae/output/.placeholder -------------------------------------------------------------------------------- /process.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Import data" 4 | ./import.sh 5 | 6 | echo "Summarize World Values" 7 | ./summarize_agreement.py 8 | ./summarize_questions.py > output/question_index.txt 9 | 10 | -------------------------------------------------------------------------------- /questions.md: -------------------------------------------------------------------------------- 1 | #V12-V22. what qualities do you encourage in your children? 2 | 3 | # V45. 
When jobs are scarce, men should have more right to a job than women. 4 | 5 | # V47. If a woman earns more money than her husband, it's almost certain to cause problems 6 | 7 | # V48. Having a job is the best way for a woman to be an independent person. 8 | 9 | # V50. When a mother works for pay, the children suffer. 10 | 11 | # V51. On the whole, men make better political leaders than women do. 12 | 13 | # V52. A university education is more important for a boy than for a girl. 14 | 15 | # V53. On the whole, men make better business executives than women do. 16 | 17 | # V54. Being a housewife is just as fulfilling as working for pay 18 | 19 | # V80. I'm going to read out some problems. Please indicate which of the following problems you consider the most serious one for the world as a whole? (Discrimination against girls and women) 20 | 21 | # V123. I am going to name a number of organizations. For each one, could you tell me how much confidence you have in them: (Women's organizations) 22 | 23 | # V139. Please tell me for each of the following things how essential you think it is as a characteristic of democracy. (Women have the same rights as men.) 24 | 25 | # V168. Companies that employ young people perform better than those that employ people of different ages. 26 | 27 | # V182. To what degree are you worried about the following situations? (Not being able to give my children a good education) 28 | 29 | # V203A. Prostitution 30 | 31 | # V204. Abortion 32 | 33 | # V205. Divorce 34 | 35 | # V206. Sex before marriage 36 | 37 | # V207. Suicide 38 | 39 | # V208. For a man to beat his wife 40 | 41 | # V209. Parents beating children 42 | 43 | # V240. Sex of respondent 44 | 45 | # V241. Respondent's birth year. 46 | 47 | # V242. Age 48 | 49 | # V250. Do you live with your parents? 
50 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | alembic==0.7.7 2 | csvkit==0.9.1 3 | dataset==0.6.0 4 | dbf==0.94.3 5 | jdcal==1.0 6 | Mako==1.0.1 7 | MarkupSafe==0.23 8 | normality==0.2.2 9 | openpyxl==2.2.0b1 10 | psycopg2==2.6.1 11 | python-dateutil==2.2 12 | PyYAML==3.11 13 | six==1.9.0 14 | SQLAlchemy==1.0.8 15 | Unidecode==0.4.18 16 | wheel==0.24.0 17 | xlrd==0.9.4 18 | -------------------------------------------------------------------------------- /summarize_agreement.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import dataset 5 | 6 | from db import query, initialize_counts, get_country_list 7 | from collections import OrderedDict 8 | 9 | 10 | ANALYSIS_QUESTIONS = ['v52', 'v45', 'v51'] 11 | ANALYSIS_COUNTRIES = ['India', 'Pakistan', 'Nigeria', 'China', 'Brazil', 'United States'] 12 | 13 | 14 | def _get_counts(result, question_id): 15 | counts = OrderedDict() 16 | for row in result: 17 | if not row['country'] in counts.keys(): 18 | counts[row['country']] = initialize_counts(question_id) 19 | counts[row["country"]][row["response"]] += 1 20 | return counts 21 | 22 | 23 | def process_mentioned(question, result, countries): 24 | counts = _get_counts(result, question['question_id']) 25 | key = '{0} {1} (% mentioned)'.format(question['question_id'], question['label']) 26 | 27 | for country, data in countries.items(): 28 | data[key] = None 29 | 30 | for country, results in counts.items(): 31 | if country not in countries.keys(): 32 | continue 33 | 34 | total = 0 35 | 36 | for count in results.values(): 37 | total += count 38 | 39 | countries[country][key] = float(results['Mentioned']) / float(total) 40 | 41 | 42 | def process_agree_3way(question, result, countries): 43 | counts = _get_counts(result, question['question_id']) 
44 | key = '{0} {1} (% agree)'.format(question['question_id'], question['label']) 45 | 46 | for country, data in countries.items(): 47 | data[key] = None 48 | 49 | for country, results in counts.items(): 50 | if country not in countries.keys(): 51 | continue 52 | 53 | total = 0 54 | 55 | for count in results.values(): 56 | total += count 57 | 58 | countries[country][key] = float(results['Agree']) / float(total) 59 | 60 | 61 | def process_agree_4way(question, result, countries): 62 | counts = _get_counts(result, question['question_id']) 63 | key = '{0} {1} (% agree strongly and agree)'.format(question['question_id'], question['label']) 64 | 65 | for country, data in countries.items(): 66 | data[key] = None 67 | 68 | for country, results in counts.items(): 69 | if country not in countries.keys(): 70 | continue 71 | 72 | total = 0 73 | 74 | for count in results.values(): 75 | total += count 76 | 77 | countries[country][key] = (float(results['Agree']) + float(results['Agree strongly'])) / float(total) 78 | 79 | 80 | def process_likert(question, result, countries): 81 | counts = _get_counts(result, question['question_id']) 82 | key = '{0} {1} (% favorable [#5-#10])'.format(question['question_id'], question['label']) 83 | 84 | for country, data in countries.items(): 85 | data[key] = None 86 | 87 | for country, results in counts.items(): 88 | if country not in countries.keys(): 89 | continue 90 | 91 | total = 0 92 | 93 | for count in results.values(): 94 | total += count 95 | 96 | favorable = sum(results.values()[5:10]) 97 | 98 | countries[country][key] = float(favorable) / float(total) 99 | 100 | 101 | def summarize_agreement(): 102 | """ 103 | Summarize agreement levels 104 | """ 105 | country_list = get_country_list() 106 | countries = OrderedDict() 107 | for country in country_list: 108 | if country in ANALYSIS_COUNTRIES: 109 | countries[country] = OrderedDict((('country', country),)) 110 | 111 | for question_id in ANALYSIS_QUESTIONS: 112 | question, result = 
query(question_id) 113 | 114 | if question['question_type'] == 'mentioned': 115 | process_mentioned(question, result, countries) 116 | 117 | if question['question_type'] == 'agree_3way': 118 | process_agree_3way(question, result, countries) 119 | 120 | if question['question_type'] == 'agree_4way': 121 | process_agree_4way(question, result, countries) 122 | 123 | if question['question_type'] == 'likert': 124 | process_likert(question, result, countries) 125 | 126 | dataset.freeze(countries.values(), format='csv', filename='output/agreement_summary.csv') 127 | 128 | 129 | if __name__ == '__main__': 130 | summarize_agreement() 131 | -------------------------------------------------------------------------------- /summarize_questions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import dataset 5 | import sys 6 | 7 | from collections import OrderedDict 8 | from db import query, initialize_counts 9 | 10 | 11 | ANALYSIS_QUESTIONS = ['v{0}'.format(i) for i in range(12,23)] 12 | ANALYSIS_QUESTIONS += ['v45', 'v47', 'v48'] 13 | ANALYSIS_QUESTIONS += ['v{0}'.format(i) for i in range(50,55)] 14 | ANALYSIS_QUESTIONS += ['v80', 'v123', 'v139', 'v145', 'v147', 'v148', 'v152', 'v168', 'v182', 'v203a'] 15 | ANALYSIS_QUESTIONS += ['v{0}'.format(i) for i in range(204,210)] 16 | ANALYSIS_QUESTIONS += ['v{0}'.format(i) for i in range(240,243)] 17 | ANALYSIS_QUESTIONS += ['v250'] 18 | 19 | 20 | def summarize_question(question_id): 21 | """ 22 | Summarize responses for a given question ID 23 | """ 24 | 25 | question, result = query(question_id) 26 | print '{0}: {1}'.format(question_id, question['label']) 27 | 28 | counts = OrderedDict() 29 | 30 | for row in result: 31 | if not row['country'] in counts.keys(): 32 | counts[row['country']] = initialize_counts(question_id) 33 | 34 | counts[row["country"]][row["response"]] += 1 35 | 36 | output = [] 37 | for country, values in counts.items(): 
38 | output_row = OrderedDict((('country', country),)) 39 | total = 0 40 | for label, value in values.items(): 41 | total += int(value) 42 | 43 | output_row['total_responses'] = total 44 | 45 | for label, value in values.items(): 46 | output_row[label] = value 47 | pct_label = '{0} pct'.format(label.encode('ascii', 'ignore').decode('ascii')) 48 | output_row[pct_label] = float(value) / total 49 | 50 | output.append(output_row) 51 | 52 | dataset.freeze(output, format='csv', filename='output/{0}.csv'.format(question_id)) 53 | 54 | if __name__ == '__main__': 55 | if len(sys.argv) > 1: 56 | questions = sys.argv[1:] 57 | else: 58 | questions = ANALYSIS_QUESTIONS 59 | 60 | for question_id in questions: 61 | summarize_question(question_id) 62 | --------------------------------------------------------------------------------