├── .gitignore
├── README.md
├── configs
    ├── caliskan.json
    ├── compare_embeddings.json
    └── histwords.json
├── images
    ├── google_news_reddit.png
    └── inst_weap_science_art.png
├── read_config.py
├── requirements.txt
├── results
    └── caliskan.json
├── results_compare_embeddings.json
├── results_histwords.json
├── run_tests.py
├── sgns-to-txt.py
└── weat.py


/.gitignore:
--------------------------------------------------------------------------------
1 | embeddings/
2 | *.pyc
3 | .DS_Store
4 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # compare-embedding-bias
 2 | 
 3 | Compare bias in word embeddings (over time, across algorithms, across corpora, or before/after debiasing) using Word Embedding Association Tests (WEATs). Results are stored as JSON; examples of graphing these results can be found in this Colab notebook:
 4 | https://colab.research.google.com/drive/1WNdOOmEenxtDhG-PRJ3K79HXBzZ-Nt-Q
 5 | 
 6 | The WEAT statistic was developed by Caliskan et al. https://purehost.bath.ac.uk/ws/portalfiles/portal/168480066/CaliskanEtAl_authors_full.pdf
 7 | 
 8 | ![Compare biases in Google News vs Reddit](images/google_news_reddit.png)
 9 | 
10 | ![Compare biases over time](images/inst_weap_science_art.png)
11 | 
12 | 
13 | # Requirements:
14 | - Python 3 
15 | - Gensim 
16 | - Numpy 
17 | - pickle (standard library; Python 3 has no separate `cPickle` module)
18 | - json
19 | - scipy
20 | 
21 | Install with:
22 | 
23 |     $ pip install -r requirements.txt
24 |   
25 | ## Quick Start
26 | 
27 | ### Replicate Caliskan Results
28 | 
29 | 1. Download the word embedding used in the original research paper, the Word2Vec Google News pretrained embeddings: https://code.google.com/archive/p/word2vec/
30 | 
31 | 2. Place the embedding inside a directory (e.g. `embeddings`), then run:
32 | 
33 | ```
34 | python run_tests.py embeddings/GoogleNews-vectors-negative300.bin
35 | ```
36 | 
37 | ## Embeddings
38 | 
39 | ### Examples of word embeddings to examine:
40 | 
41 | Word2Vec: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit
42 | 
43 | GloVe (Common Crawl 840B): http://nlp.stanford.edu/data/glove.840B.300d.zip
44 | 
45 | GloVe (Twitter 2B): http://nlp.stanford.edu/data/glove.twitter.27B.zip
46 | 
47 | ### Example of word embeddings to examine bias over time:
48 | 
49 | HistWords: https://nlp.stanford.edu/projects/histwords/
50 | 
51 | *Note:* the original HistWords embeddings are not compatible with gensim. Run `sgns-to-txt.py` to convert them to a format that gensim can load.
52 | 
53 | Example:
54 | 
55 | ```
56 | python sgns-to-txt.py embeddings/sgns-fiction
57 | ```
58 | 
59 | # Example Commands
60 | ```
61 | python weat.py configs/histwords.json
62 | python weat.py configs/compare_embeddings.json 
63 | ```
64 | 


--------------------------------------------------------------------------------
/configs/caliskan.json:
--------------------------------------------------------------------------------
  1 | 	{ "tests":
  2 | 		{
  3 | 			"Flowers vs Insects (25) / Pleasant vs Unpleasant (25)":{
  4 | 				"targets_desc":"Flowers vs Insects (25)",
  5 | 				"attributes_desc":"Pleasant vs Unpleasant (25)",
  6 | 				"X":[
  7 | 					"aster",
  8 | 					"clover",
  9 | 					"hyacinth",
 10 | 					"marigold",
 11 | 					"poppy",
 12 | 					"azalea",
 13 | 					"crocus",
 14 | 					"iris",
 15 | 					"orchid",
 16 | 					"rose",
 17 | 					"bluebell",
 18 | 					"daffodil",
 19 | 					"lilac",
 20 | 					"pansy",
 21 | 					"tulip",
 22 | 					"buttercup",
 23 | 					"daisy",
 24 | 					"lily",
 25 | 					"peony",
 26 | 					"violet",
 27 | 					"carnation",
 28 | 					"gladiola",
 29 | 					"magnolia",
 30 | 					"petunia",
 31 | 					"zinnia"
 32 | 				],
 33 | 				"Y":[
 34 | 					"ant",
 35 | 					"caterpillar",
 36 | 					"flea",
 37 | 					"locust",
 38 | 					"spider",
 39 | 					"bedbug",
 40 | 					"centipede",
 41 | 					"fly",
 42 | 					"maggot",
 43 | 					"tarantula",
 44 | 					"bee",
 45 | 					"cockroach",
 46 | 					"gnat",
 47 | 					"mosquito",
 48 | 					"termite",
 49 | 					"beetle",
 50 | 					"cricket",
 51 | 					"hornet",
 52 | 					"moth",
 53 | 					"wasp",
 54 | 					"blackfly",
 55 | 					"dragonfly",
 56 | 					"horsefly",
 57 | 					"roach",
 58 | 					"weevil"
 59 | 				],
 60 | 				"A":[
 61 | 					"caress",
 62 | 					"freedom",
 63 | 					"health",
 64 | 					"love",
 65 | 					"peace",
 66 | 					"cheer",
 67 | 					"friend",
 68 | 					"heaven",
 69 | 					"loyal",
 70 | 					"pleasure",
 71 | 					"diamond",
 72 | 					"gentle",
 73 | 					"honest",
 74 | 					"lucky",
 75 | 					"rainbow",
 76 | 					"diploma",
 77 | 					"gift",
 78 | 					"honor",
 79 | 					"miracle",
 80 | 					"sunrise",
 81 | 					"family",
 82 | 					"happy",
 83 | 					"laughter",
 84 | 					"paradise",
 85 | 					"vacation"
 86 | 				],
 87 | 				"B":[
 88 | 					"abuse",
 89 | 					"crash",
 90 | 					"filth",
 91 | 					"murder",
 92 | 					"sickness",
 93 | 					"accident",
 94 | 					"death",
 95 | 					"grief",
 96 | 					"poison",
 97 | 					"stink",
 98 | 					"assault",
 99 | 					"disaster",
100 | 					"hatred",
101 | 					"pollute",
102 | 					"tragedy",
103 | 					"divorce",
104 | 					"jail",
105 | 					"poverty",
106 | 					"ugly",
107 | 					"cancer",
108 | 					"kill",
109 | 					"rotten",
110 | 					"vomit",
111 | 					"agony",
112 | 					"prison"
113 | 				]
114 | 			},
115 | 			"Instruments vs Weapons /  Pleasant vs Unpleasant":{
116 | 				"X":[
117 | 					"bagpipe",
118 | 					"cello",
119 | 					"guitar",
120 | 					"lute",
121 | 					"trombone",
122 | 					"banjo",
123 | 					"clarinet",
124 | 					"harmonica",
125 | 					"mandolin",
126 | 					"trumpet",
127 | 					"bassoon",
128 | 					"drum",
129 | 					"harp",
130 | 					"oboe",
131 | 					"tuba",
132 | 					"bell",
133 | 					"fiddle",
134 | 					"harpsichord",
135 | 					"piano",
136 | 					"viola",
137 | 					"bongo",
138 | 					"flute",
139 | 					"horn",
140 | 					"saxophone",
141 | 					"violin"
142 | 				],
143 | 				"Y":[
144 | 					"arrow",
145 | 					"club",
146 | 					"gun",
147 | 					"missile",
148 | 					"spear",
149 | 					"ax",
150 | 					"dagger",
151 | 					"harpoon",
152 | 					"pistol",
153 | 					"sword",
154 | 					"blade",
155 | 					"dynamite",
156 | 					"hatchet",
157 | 					"rifle",
158 | 					"tank",
159 | 					"bomb",
160 | 					"firearm",
161 | 					"knife",
162 | 					"shotgun",
163 | 					"teargas",
164 | 					"cannon",
165 | 					"grenade",
166 | 					"mace",
167 | 					"slingshot",
168 | 					"whip"
169 | 				],
170 | 				"A":[
171 | 					"caress",
172 | 					"freedom",
173 | 					"health",
174 | 					"love",
175 | 					"peace",
176 | 					"cheer",
177 | 					"friend",
178 | 					"heaven",
179 | 					"loyal",
180 | 					"pleasure",
181 | 					"diamond",
182 | 					"gentle",
183 | 					"honest",
184 | 					"lucky",
185 | 					"rainbow",
186 | 					"diploma",
187 | 					"gift",
188 | 					"honor",
189 | 					"miracle",
190 | 					"sunrise",
191 | 					"family",
192 | 					"happy",
193 | 					"laughter",
194 | 					"paradise",
195 | 					"vacation"
196 | 				],
197 | 				"B":[
198 | 					"abuse",
199 | 					"crash",
200 | 					"filth",
201 | 					"murder",
202 | 					"sickness",
203 | 					"accident",
204 | 					"death",
205 | 					"grief",
206 | 					"poison",
207 | 					"stink",
208 | 					"assault",
209 | 					"disaster",
210 | 					"hatred",
211 | 					"pollute",
212 | 					"tragedy",
213 | 					"divorce",
214 | 					"jail",
215 | 					"poverty",
216 | 					"ugly",
217 | 					"cancer",
218 | 					"kill",
219 | 					"rotten",
220 | 					"vomit",
221 | 					"agony",
222 | 					"prison"
223 | 				]
224 | 			},
225 | 			"European names vs African American Names /  Pleasant3 vs Unpleasant3":{
226 | 				"X":[
227 | 					"Adam",
228 | 					"Harry",
229 | 					"Josh",
230 | 					"Roger",
231 | 					"Alan",
232 | 					"Frank",
233 | 					"Justin",
234 | 					"Ryan",
235 | 					"Andrew",
236 | 					"Jack",
237 | 					"Matthew",
238 | 					"Stephen",
239 | 					"Brad",
240 | 					"Greg",
241 | 					"Paul",
242 | 					"Jonathan",
243 | 					"Peter",
244 | 					"Amanda",
245 | 					"Courtney",
246 | 					"Heather",
247 | 					"Melanie",
248 | 					"Katie",
249 | 					"Betsy",
250 | 					"Kristin",
251 | 					"Nancy",
252 | 					"Stephanie",
253 | 					"Ellen",
254 | 					"Lauren",
255 | 					"Colleen",
256 | 					"Emily",
257 | 					"Megan",
258 | 					"Rachel"
259 | 				],
260 | 				"Y":[
261 | 					"Alonzo",
262 | 					"Jamel",
263 | 					"Theo",
264 | 					"Alphonse",
265 | 					"Jerome",
266 | 					"Leroy",
267 | 					"Torrance",
268 | 					"Darnell",
269 | 					"Lamar",
270 | 					"Lionel",
271 | 					"Tyree",
272 | 					"Deion",
273 | 					"Lamont",
274 | 					"Malik",
275 | 					"Terrence",
276 | 					"Tyrone",
277 | 					"Lavon",
278 | 					"Marcellus",
279 | 					"Wardell",
280 | 					"Nichelle",
281 | 					"Shereen",
282 | 					"Ebony",
283 | 					"Latisha",
284 | 					"Shaniqua",
285 | 					"Jasmine",
286 | 					"Tanisha",
287 | 					"Tia",
288 | 					"Lakisha",
289 | 					"Latoya",
290 | 					"Yolanda",
291 | 					"Malika",
292 | 					"Yvette"
293 | 				],
294 | 				"A":[
295 | 					"caress",
296 | 					"freedom",
297 | 					"health",
298 | 					"love",
299 | 					"peace",
300 | 					"cheer",
301 | 					"friend",
302 | 					"heaven",
303 | 					"loyal",
304 | 					"pleasure",
305 | 					"diamond",
306 | 					"gentle",
307 | 					"honest",
308 | 					"lucky",
309 | 					"rainbow",
310 | 					"diploma",
311 | 					"gift",
312 | 					"honor",
313 | 					"miracle",
314 | 					"sunrise",
315 | 					"family",
316 | 					"happy",
317 | 					"laughter",
318 | 					"paradise",
319 | 					"vacation"
320 | 				],
321 | 				"B":[
322 | 					"abuse",
323 | 					"crash",
324 | 					"filth",
325 | 					"murder",
326 | 					"sickness",
327 | 					"accident",
328 | 					"death",
329 | 					"grief",
330 | 					"poison",
331 | 					"stink",
332 | 					"assault",
333 | 					"disaster",
334 | 					"hatred",
335 | 					"pollute",
336 | 					"tragedy",
337 | 					"bomb",
338 | 					"divorce",
339 | 					"jail",
340 | 					"poverty",
341 | 					"ugly",
342 | 					"cancer",
343 | 					"evil",
344 | 					"kill",
345 | 					"rotten",
346 | 					"vomit"
347 | 				]
348 | 			},
349 | 			"Male vs Female /  Career vs Family":{
350 | 				"X":[
351 | 					"John",
352 | 					"Paul",
353 | 					"Mike",
354 | 					"Kevin",
355 | 					"Steve",
356 | 					"Greg",
357 | 					"Jeff",
358 | 					"Bill"
359 | 				],
360 | 				"Y":[
361 | 					"Amy",
362 | 					"Joan",
363 | 					"Lisa",
364 | 					"Sarah",
365 | 					"Diana",
366 | 					"Kate",
367 | 					"Ann",
368 | 					"Donna"
369 | 				],
370 | 				"A":[
371 | 					"executive",
372 | 					"management",
373 | 					"professional",
374 | 					"corporation",
375 | 					"salary",
376 | 					"office",
377 | 					"business",
378 | 					"career"
379 | 				],
380 | 				"B":[
381 | 					"home",
382 | 					"parents",
383 | 					"children",
384 | 					"family",
385 | 					"cousins",
386 | 					"marriage",
387 | 					"wedding",
388 | 					"relatives"
389 | 				]
390 | 			},
391 | 			"Math vs Art / Male vs Female":{
392 | 				"X":[
393 | 					"math",
394 | 					"algebra",
395 | 					"geometry",
396 | 					"calculus",
397 | 					"equations",
398 | 					"computation",
399 | 					"numbers",
400 | 					"addition"
401 | 				],
402 | 				"Y":[
403 | 					"poetry",
404 | 					"art",
405 | 					"Shakespeare",
406 | 					"dance",
407 | 					"literature",
408 | 					"novel",
409 | 					"symphony",
410 | 					"drama"
411 | 				],
412 | 				"A":[
413 | 					"brother", "father", "uncle", "grandfather", "son", "he", "his", "him"
414 | 				],
415 | 				"B":[
416 | 					"sister", "mother", "aunt", "grandmother", "daughter", "she", "hers", "her"
417 | 				]
418 | 			},
419 | 			"Science vs Art / Male vs Female":{
420 | 				"X":[
421 | 					"science",
422 | 					"technology",
423 | 					"physics",
424 | 					"chemistry",
425 | 					"Einstein",
426 | 					"NASA",
427 | 					"experiment",
428 | 					"astronomy"
429 | 				],
430 | 				"Y":[
431 | 					"poetry",
432 | 					"art",
433 | 					"Shakespeare",
434 | 					"dance",
435 | 					"literature",
436 | 					"novel",
437 | 					"symphony",
438 | 					"drama"
439 | 				],
440 | 				"A":[
441 | 					"brother", "father", "uncle", "grandfather", "son", "he", "his", "him"
442 | 				],
443 | 				"B":[
444 | 					"sister", "mother", "aunt", "grandmother", "daughter", "she", "hers", "her"
445 | 				]
446 | 			}
447 | 		}
448 | 	}
449 | 


--------------------------------------------------------------------------------
/configs/compare_embeddings.json:
--------------------------------------------------------------------------------
  1 | 	{ "embeddings" : {"Reddit": "embeddings/model_full_reddit", "Google News": "embeddings/GoogleNews-vectors-negative300.bin"},
  2 | 		"tests":
  3 | 		{
  4 | 			"Flowers vs Insects (25) / Pleasant vs Unpleasant (25)":{
  5 | 				"targets_desc":"Flowers vs Insects (25)",
  6 | 				"attributes_desc":"Pleasant vs Unpleasant (25)",
  7 | 				"X":[
  8 | 					"aster",
  9 | 					"clover",
 10 | 					"hyacinth",
 11 | 					"marigold",
 12 | 					"poppy",
 13 | 					"azalea",
 14 | 					"crocus",
 15 | 					"iris",
 16 | 					"orchid",
 17 | 					"rose",
 18 | 					"bluebell",
 19 | 					"daffodil",
 20 | 					"lilac",
 21 | 					"pansy",
 22 | 					"tulip",
 23 | 					"buttercup",
 24 | 					"daisy",
 25 | 					"lily",
 26 | 					"peony",
 27 | 					"violet",
 28 | 					"carnation",
 29 | 					"gladiola",
 30 | 					"magnolia",
 31 | 					"petunia",
 32 | 					"zinnia"
 33 | 				],
 34 | 				"Y":[
 35 | 					"ant",
 36 | 					"caterpillar",
 37 | 					"flea",
 38 | 					"locust",
 39 | 					"spider",
 40 | 					"bedbug",
 41 | 					"centipede",
 42 | 					"fly",
 43 | 					"maggot",
 44 | 					"tarantula",
 45 | 					"bee",
 46 | 					"cockroach",
 47 | 					"gnat",
 48 | 					"mosquito",
 49 | 					"termite",
 50 | 					"beetle",
 51 | 					"cricket",
 52 | 					"hornet",
 53 | 					"moth",
 54 | 					"wasp",
 55 | 					"blackfly",
 56 | 					"dragonfly",
 57 | 					"horsefly",
 58 | 					"roach",
 59 | 					"weevil"
 60 | 				],
 61 | 				"A":[
 62 | 					"caress",
 63 | 					"freedom",
 64 | 					"health",
 65 | 					"love",
 66 | 					"peace",
 67 | 					"cheer",
 68 | 					"friend",
 69 | 					"heaven",
 70 | 					"loyal",
 71 | 					"pleasure",
 72 | 					"diamond",
 73 | 					"gentle",
 74 | 					"honest",
 75 | 					"lucky",
 76 | 					"rainbow",
 77 | 					"diploma",
 78 | 					"gift",
 79 | 					"honor",
 80 | 					"miracle",
 81 | 					"sunrise",
 82 | 					"family",
 83 | 					"happy",
 84 | 					"laughter",
 85 | 					"paradise",
 86 | 					"vacation"
 87 | 				],
 88 | 				"B":[
 89 | 					"abuse",
 90 | 					"crash",
 91 | 					"filth",
 92 | 					"murder",
 93 | 					"sickness",
 94 | 					"accident",
 95 | 					"death",
 96 | 					"grief",
 97 | 					"poison",
 98 | 					"stink",
 99 | 					"assault",
100 | 					"disaster",
101 | 					"hatred",
102 | 					"pollute",
103 | 					"tragedy",
104 | 					"divorce",
105 | 					"jail",
106 | 					"poverty",
107 | 					"ugly",
108 | 					"cancer",
109 | 					"kill",
110 | 					"rotten",
111 | 					"vomit",
112 | 					"agony",
113 | 					"prison"
114 | 				]
115 | 			},
116 | 			"Instruments vs Weapons /  Pleasant vs Unpleasant":{
117 | 				"X":[
118 | 					"bagpipe",
119 | 					"cello",
120 | 					"guitar",
121 | 					"lute",
122 | 					"trombone",
123 | 					"banjo",
124 | 					"clarinet",
125 | 					"harmonica",
126 | 					"mandolin",
127 | 					"trumpet",
128 | 					"bassoon",
129 | 					"drum",
130 | 					"harp",
131 | 					"oboe",
132 | 					"tuba",
133 | 					"bell",
134 | 					"fiddle",
135 | 					"harpsichord",
136 | 					"piano",
137 | 					"viola",
138 | 					"bongo",
139 | 					"flute",
140 | 					"horn",
141 | 					"saxophone",
142 | 					"violin"
143 | 				],
144 | 				"Y":[
145 | 					"arrow",
146 | 					"club",
147 | 					"gun",
148 | 					"missile",
149 | 					"spear",
150 | 					"ax",
151 | 					"dagger",
152 | 					"harpoon",
153 | 					"pistol",
154 | 					"sword",
155 | 					"blade",
156 | 					"dynamite",
157 | 					"hatchet",
158 | 					"rifle",
159 | 					"tank",
160 | 					"bomb",
161 | 					"firearm",
162 | 					"knife",
163 | 					"shotgun",
164 | 					"teargas",
165 | 					"cannon",
166 | 					"grenade",
167 | 					"mace",
168 | 					"slingshot",
169 | 					"whip"
170 | 				],
171 | 				"A":[
172 | 					"caress",
173 | 					"freedom",
174 | 					"health",
175 | 					"love",
176 | 					"peace",
177 | 					"cheer",
178 | 					"friend",
179 | 					"heaven",
180 | 					"loyal",
181 | 					"pleasure",
182 | 					"diamond",
183 | 					"gentle",
184 | 					"honest",
185 | 					"lucky",
186 | 					"rainbow",
187 | 					"diploma",
188 | 					"gift",
189 | 					"honor",
190 | 					"miracle",
191 | 					"sunrise",
192 | 					"family",
193 | 					"happy",
194 | 					"laughter",
195 | 					"paradise",
196 | 					"vacation"
197 | 				],
198 | 				"B":[
199 | 					"abuse",
200 | 					"crash",
201 | 					"filth",
202 | 					"murder",
203 | 					"sickness",
204 | 					"accident",
205 | 					"death",
206 | 					"grief",
207 | 					"poison",
208 | 					"stink",
209 | 					"assault",
210 | 					"disaster",
211 | 					"hatred",
212 | 					"pollute",
213 | 					"tragedy",
214 | 					"divorce",
215 | 					"jail",
216 | 					"poverty",
217 | 					"ugly",
218 | 					"cancer",
219 | 					"kill",
220 | 					"rotten",
221 | 					"vomit",
222 | 					"agony",
223 | 					"prison"
224 | 				]
225 | 			},
226 | 			"European names vs African American Names /  Pleasant3 vs Unpleasant3":{
227 | 				"X":[
228 | 					"Adam",
229 | 					"Harry",
230 | 					"Josh",
231 | 					"Roger",
232 | 					"Alan",
233 | 					"Frank",
234 | 					"Justin",
235 | 					"Ryan",
236 | 					"Andrew",
237 | 					"Jack",
238 | 					"Matthew",
239 | 					"Stephen",
240 | 					"Brad",
241 | 					"Greg",
242 | 					"Paul",
243 | 					"Jonathan",
244 | 					"Peter",
245 | 					"Amanda",
246 | 					"Courtney",
247 | 					"Heather",
248 | 					"Melanie",
249 | 					"Katie",
250 | 					"Betsy",
251 | 					"Kristin",
252 | 					"Nancy",
253 | 					"Stephanie",
254 | 					"Ellen",
255 | 					"Lauren",
256 | 					"Colleen",
257 | 					"Emily",
258 | 					"Megan",
259 | 					"Rachel"
260 | 				],
261 | 				"Y":[
262 | 					"Alonzo",
263 | 					"Jamel",
264 | 					"Theo",
265 | 					"Alphonse",
266 | 					"Jerome",
267 | 					"Leroy",
268 | 					"Torrance",
269 | 					"Darnell",
270 | 					"Lamar",
271 | 					"Lionel",
272 | 					"Tyree",
273 | 					"Deion",
274 | 					"Lamont",
275 | 					"Malik",
276 | 					"Terrence",
277 | 					"Tyrone",
278 | 					"Lavon",
279 | 					"Marcellus",
280 | 					"Wardell",
281 | 					"Nichelle",
282 | 					"Shereen",
283 | 					"Ebony",
284 | 					"Latisha",
285 | 					"Shaniqua",
286 | 					"Jasmine",
287 | 					"Tanisha",
288 | 					"Tia",
289 | 					"Lakisha",
290 | 					"Latoya",
291 | 					"Yolanda",
292 | 					"Malika",
293 | 					"Yvette"
294 | 				],
295 | 				"A":[
296 | 					"caress",
297 | 					"freedom",
298 | 					"health",
299 | 					"love",
300 | 					"peace",
301 | 					"cheer",
302 | 					"friend",
303 | 					"heaven",
304 | 					"loyal",
305 | 					"pleasure",
306 | 					"diamond",
307 | 					"gentle",
308 | 					"honest",
309 | 					"lucky",
310 | 					"rainbow",
311 | 					"diploma",
312 | 					"gift",
313 | 					"honor",
314 | 					"miracle",
315 | 					"sunrise",
316 | 					"family",
317 | 					"happy",
318 | 					"laughter",
319 | 					"paradise",
320 | 					"vacation"
321 | 				],
322 | 				"B":[
323 | 					"abuse",
324 | 					"crash",
325 | 					"filth",
326 | 					"murder",
327 | 					"sickness",
328 | 					"accident",
329 | 					"death",
330 | 					"grief",
331 | 					"poison",
332 | 					"stink",
333 | 					"assault",
334 | 					"disaster",
335 | 					"hatred",
336 | 					"pollute",
337 | 					"tragedy",
338 | 					"bomb",
339 | 					"divorce",
340 | 					"jail",
341 | 					"poverty",
342 | 					"ugly",
343 | 					"cancer",
344 | 					"evil",
345 | 					"kill",
346 | 					"rotten",
347 | 					"vomit"
348 | 				]
349 | 			},
350 | 			"Male vs Female /  Career vs Family":{
351 | 				"X":[
352 | 					"John",
353 | 					"Paul",
354 | 					"Mike",
355 | 					"Kevin",
356 | 					"Steve",
357 | 					"Greg",
358 | 					"Jeff",
359 | 					"Bill"
360 | 				],
361 | 				"Y":[
362 | 					"Amy",
363 | 					"Joan",
364 | 					"Lisa",
365 | 					"Sarah",
366 | 					"Diana",
367 | 					"Kate",
368 | 					"Ann",
369 | 					"Donna"
370 | 				],
371 | 				"A":[
372 | 					"executive",
373 | 					"management",
374 | 					"professional",
375 | 					"corporation",
376 | 					"salary",
377 | 					"office",
378 | 					"business",
379 | 					"career"
380 | 				],
381 | 				"B":[
382 | 					"home",
383 | 					"parents",
384 | 					"children",
385 | 					"family",
386 | 					"cousins",
387 | 					"marriage",
388 | 					"wedding",
389 | 					"relatives"
390 | 				]
391 | 			},
392 | 			"Math vs Art / Male8 vs Female8":{
393 | 				"X":[
394 | 					"math",
395 | 					"algebra",
396 | 					"geometry",
397 | 					"calculus",
398 | 					"equations",
399 | 					"computation",
400 | 					"numbers",
401 | 					"addition"
402 | 				],
403 | 				"Y":[
404 | 					"poetry",
405 | 					"art",
406 | 					"Shakespeare",
407 | 					"dance",
408 | 					"literature",
409 | 					"novel",
410 | 					"symphony",
411 | 					"drama"
412 | 				],
413 | 				"A":[
414 | 					"brother", "father", "uncle", "grandfather", "son", "he", "his", "him"
415 | 				],
416 | 				"B":[
417 | 					"sister", "mother", "aunt", "grandmother", "daughter", "she", "hers", "her"
418 | 				]
419 | 			},
420 | 			"Science vs Art / Male8 vs Female8":{
421 | 				"X":[
422 | 					"science",
423 | 					"technology",
424 | 					"physics",
425 | 					"chemistry",
426 | 					"Einstein",
427 | 					"NASA",
428 | 					"experiment",
429 | 					"astronomy"
430 | 				],
431 | 				"Y":[
432 | 					"poetry",
433 | 					"art",
434 | 					"Shakespeare",
435 | 					"dance",
436 | 					"literature",
437 | 					"novel",
438 | 					"symphony",
439 | 					"drama"
440 | 				],
441 | 				"A":[
442 | 					"brother", "father", "uncle", "grandfather", "son", "he", "his", "him"
443 | 				],
444 | 				"B":[
445 | 					"sister", "mother", "aunt", "grandmother", "daughter", "she", "hers", "her"
446 | 				]
447 | 			},
448 | 			"Straight vs Gay / Good vs Bad":{
449 | 				"X":[
450 | 					"straight",
451 | 					"heterosexual"
452 | 				],
453 | 				"Y":[
454 | 					"gay",
455 | 					"homosexual"
456 | 				],
457 | 				"A":[
458 | 					"beautiful",
459 | 					"superb",
460 | 					"joyful",
461 | 					"lovely",
462 | 					"glorious",
463 | 					"pleasure",
464 | 					"marvelous",
465 | 					"wonderful"
466 | 				],
467 | 				"B":[
468 | 					"humiliate",
469 | 					"agony",
470 | 					"awful",
471 | 					"nasty",
472 | 					"terrible",
473 | 					"horrible",
474 | 					"tragic",
475 | 					"painful"
476 | 				]
477 | 			},
478 | 			"Arab + Muslim vs Other /  Good vs Bad":{
479 | 				"X":[
480 | 					"Ernesto",
481 | 					"Matthais",
482 | 					"Maarten",
483 | 					"Philippe",
484 | 					"Guillame",
485 | 					"Benoit",
486 | 					"Takuya",
487 | 					"Kazuki",
488 | 					"Chaiyo",
489 | 					"Marcelo"
490 | 				],
491 | 				"Y":[
492 | 					"hakim",
493 | 					"sharif",
494 | 					"yousef",
495 | 					"wahib",
496 | 					"muhsin",
497 | 					"salim",
498 | 					"karim",
499 | 					"habib",
500 | 					"ashraf",
501 | 					"akbar"
502 | 				],
503 | 				"A":[
504 | 					"laughter",
505 | 					"happy",
506 | 					"joy",
507 | 					"love",
508 | 					"glorious",
509 | 					"pleasure",
510 | 					"peace",
511 | 					"wonderful"
512 | 				],
513 | 				"B":[
514 | 					"failure",
515 | 					"agony",
516 | 					"awful",
517 | 					"nasty",
518 | 					"terrible",
519 | 					"horrible",
520 | 					"nasty",
521 | 					"evil"
522 | 				]
523 | 			},
524 | 			"Christianity vs Judaism /  Good vs Bad":{
525 | 				"X":[
526 | 					"church",
527 | 					"bible",
528 | 					"christian",
529 | 					"christianity"
530 | 				],
531 | 				"Y":[
532 | 					"synagogue",
533 | 					"torah",
534 | 					"jew",
535 | 					"judaism"
536 | 				],
537 | 				"A":[
538 | 					"love",
539 | 					"cheer",
540 | 					"friend",
541 | 					"pleasure",
542 | 					"adore",
543 | 					"cheerful",
544 | 					"friendship",
545 | 					"joyful",
546 | 					"smiling",
547 | 					"cherish",
548 | 					"excellent",
549 | 					"glad",
550 | 					"joyous",
551 | 					"spectacular",
552 | 					"appealing",
553 | 					"delight",
554 | 					"excitement",
555 | 					"laughing",
556 | 					"attractive",
557 | 					"delightful",
558 | 					"fabulous",
559 | 					"glorious",
560 | 					"pleasing",
561 | 					"beautiful",
562 | 					"fantastic",
563 | 					"happy",
564 | 					"lovely",
565 | 					"terrific",
566 | 					"celebrate",
567 | 					"enjoy",
568 | 					"magnificent",
569 | 					"triumph"
570 | 				],
571 | 				"B":[
572 | 					"abuse",
573 | 					"grief",
574 | 					"poison",
575 | 					"sadness",
576 | 					"pain",
577 | 					"despise",
578 | 					"failure",
579 | 					"nasty",
580 | 					"angry",
581 | 					"detest",
582 | 					"horrible",
583 | 					"negative",
584 | 					"ugly",
585 | 					"dirty",
586 | 					"gross",
587 | 					"evil",
588 | 					"rotten",
589 | 					"annoy",
590 | 					"disaster",
591 | 					"horrific",
592 | 					"scorn",
593 | 					"awful",
594 | 					"disgust",
595 | 					"hate",
596 | 					"humiliate",
597 | 					"selfish",
598 | 					"tragic",
599 | 					"bothersome",
600 | 					"hatred",
601 | 					"hurtful",
602 | 					"sickening",
603 | 					"yucky"
604 | 				]
605 | 			},
606 | 			"Christianity vs Islam /  Good vs Bad":{
607 | 				"X":[
608 | 					"church",
609 | 					"bible",
610 | 					"christian",
611 | 					"christianity"
612 | 				],
613 | 				"Y":[
614 | 					"mosque",
615 | 					"koran",
616 | 					"muslim",
617 | 					"islam"
618 | 				],
619 | 				"A":[
620 | 					"love",
621 | 					"cheer",
622 | 					"friend",
623 | 					"pleasure",
624 | 					"adore",
625 | 					"cheerful",
626 | 					"friendship",
627 | 					"joyful",
628 | 					"smiling",
629 | 					"cherish",
630 | 					"excellent",
631 | 					"glad",
632 | 					"joyous",
633 | 					"spectacular",
634 | 					"appealing",
635 | 					"delight",
636 | 					"excitement",
637 | 					"laughing",
638 | 					"attractive",
639 | 					"delightful",
640 | 					"fabulous",
641 | 					"glorious",
642 | 					"pleasing",
643 | 					"beautiful",
644 | 					"fantastic",
645 | 					"happy",
646 | 					"lovely",
647 | 					"terrific",
648 | 					"celebrate",
649 | 					"enjoy",
650 | 					"magnificent",
651 | 					"triumph"
652 | 				],
653 | 				"B":[
654 | 					"abuse",
655 | 					"grief",
656 | 					"poison",
657 | 					"sadness",
658 | 					"pain",
659 | 					"despise",
660 | 					"failure",
661 | 					"nasty",
662 | 					"angry",
663 | 					"detest",
664 | 					"horrible",
665 | 					"negative",
666 | 					"ugly",
667 | 					"dirty",
668 | 					"gross",
669 | 					"evil",
670 | 					"rotten",
671 | 					"annoy",
672 | 					"disaster",
673 | 					"horrific",
674 | 					"scorn",
675 | 					"awful",
676 | 					"disgust",
677 | 					"hate",
678 | 					"humiliate",
679 | 					"selfish",
680 | 					"tragic",
681 | 					"bothersome",
682 | 					"hatred",
683 | 					"hurtful",
684 | 					"sickening",
685 | 					"yucky"
686 | 				]
687 | 			},
688 | 			"Judaism vs Islam /  Good vs Bad":{
689 | 				"X":[
690 | 					"synagogue",
691 | 					"torah",
692 | 					"jew",
693 | 					"judaism"
694 | 				],
695 | 				"Y":[
696 | 					"mosque",
697 | 					"koran",
698 | 					"muslim",
699 | 					"islam"
700 | 				],
701 | 				"A":[
702 | 					"love",
703 | 					"cheer",
704 | 					"friend",
705 | 					"pleasure",
706 | 					"adore",
707 | 					"cheerful",
708 | 					"friendship",
709 | 					"joyful",
710 | 					"smiling",
711 | 					"cherish",
712 | 					"excellent",
713 | 					"glad",
714 | 					"joyous",
715 | 					"spectacular",
716 | 					"appealing",
717 | 					"delight",
718 | 					"excitement",
719 | 					"laughing",
720 | 					"attractive",
721 | 					"delightful",
722 | 					"fabulous",
723 | 					"glorious",
724 | 					"pleasing",
725 | 					"beautiful",
726 | 					"fantastic",
727 | 					"happy",
728 | 					"lovely",
729 | 					"terrific",
730 | 					"celebrate",
731 | 					"enjoy",
732 | 					"magnificent",
733 | 					"triumph"
734 | 				],
735 | 				"B":[
736 | 					"abuse",
737 | 					"grief",
738 | 					"poison",
739 | 					"sadness",
740 | 					"pain",
741 | 					"despise",
742 | 					"failure",
743 | 					"nasty",
744 | 					"angry",
745 | 					"detest",
746 | 					"horrible",
747 | 					"negative",
748 | 					"ugly",
749 | 					"dirty",
750 | 					"gross",
751 | 					"evil",
752 | 					"rotten",
753 | 					"annoy",
754 | 					"disaster",
755 | 					"horrific",
756 | 					"scorn",
757 | 					"awful",
758 | 					"disgust",
759 | 					"hate",
760 | 					"humiliate",
761 | 					"selfish",
762 | 					"tragic",
763 | 					"bothersome",
764 | 					"hatred",
765 | 					"hurtful",
766 | 					"sickening",
767 | 					"yucky"
768 | 				]
769 | 			}
770 | 		}
771 | 	}
772 | 


--------------------------------------------------------------------------------
/configs/histwords.json:
--------------------------------------------------------------------------------
  1 | 	{ "embeddings" : {"Histwords":
  2 | 		{"1800" :  "embeddings/sgns-all-txts/1800.txt",
  3 | 			"1810" : "embeddings/sgns-all-txts/1810.txt",
  4 | 			"1820" : "embeddings/sgns-all-txts/1820.txt",
  5 | 			"1830" : "embeddings/sgns-all-txts/1830.txt",
  6 | 			"1840" : "embeddings/sgns-all-txts/1840.txt",
  7 | 			"1850" : "embeddings/sgns-all-txts/1850.txt",
  8 | 			"1860" : "embeddings/sgns-all-txts/1860.txt",
  9 | 			"1870" : "embeddings/sgns-all-txts/1870.txt",
 10 | 			"1880" : "embeddings/sgns-all-txts/1880.txt",
 11 | 			"1890" : "embeddings/sgns-all-txts/1890.txt",
 12 | 			"1900" : "embeddings/sgns-all-txts/1900.txt",
 13 | 			"1910" : "embeddings/sgns-all-txts/1910.txt",
 14 | 			"1920" : "embeddings/sgns-all-txts/1920.txt",
 15 | 			"1930" : "embeddings/sgns-all-txts/1930.txt",
 16 | 			"1940" : "embeddings/sgns-all-txts/1940.txt",
 17 | 			"1950" : "embeddings/sgns-all-txts/1950.txt",
 18 | 			"1960" : "embeddings/sgns-all-txts/1960.txt",
 19 | 			"1970" : "embeddings/sgns-all-txts/1970.txt",
 20 | 			"1980" : "embeddings/sgns-all-txts/1980.txt",
 21 | 			"1990" : "embeddings/sgns-all-txts/1990.txt"
 22 | 		}
 23 | 	},
 24 | 		"tests":
 25 | 		{
 26 | 			"Flowers vs Insects (25) / Pleasant vs Unpleasant (25)":{
 27 | 				"targets_desc":"Flowers vs Insects (25)",
 28 | 				"attributes_desc":"Pleasant vs Unpleasant (25)",
 29 | 				"X":[
 30 | 					"aster",
 31 | 					"clover",
 32 | 					"hyacinth",
 33 | 					"marigold",
 34 | 					"poppy",
 35 | 					"azalea",
 36 | 					"crocus",
 37 | 					"iris",
 38 | 					"orchid",
 39 | 					"rose",
 40 | 					"bluebell",
 41 | 					"daffodil",
 42 | 					"lilac",
 43 | 					"pansy",
 44 | 					"tulip",
 45 | 					"buttercup",
 46 | 					"daisy",
 47 | 					"lily",
 48 | 					"peony",
 49 | 					"violet",
 50 | 					"carnation",
 51 | 					"gladiola",
 52 | 					"magnolia",
 53 | 					"petunia",
 54 | 					"zinnia"
 55 | 				],
 56 | 				"Y":[
 57 | 					"ant",
 58 | 					"caterpillar",
 59 | 					"flea",
 60 | 					"locust",
 61 | 					"spider",
 62 | 					"bedbug",
 63 | 					"centipede",
 64 | 					"fly",
 65 | 					"maggot",
 66 | 					"tarantula",
 67 | 					"bee",
 68 | 					"cockroach",
 69 | 					"gnat",
 70 | 					"mosquito",
 71 | 					"termite",
 72 | 					"beetle",
 73 | 					"cricket",
 74 | 					"hornet",
 75 | 					"moth",
 76 | 					"wasp",
 77 | 					"blackfly",
 78 | 					"dragonfly",
 79 | 					"horsefly",
 80 | 					"roach",
 81 | 					"weevil"
 82 | 				],
 83 | 				"A":[
 84 | 					"caress",
 85 | 					"freedom",
 86 | 					"health",
 87 | 					"love",
 88 | 					"peace",
 89 | 					"cheer",
 90 | 					"friend",
 91 | 					"heaven",
 92 | 					"loyal",
 93 | 					"pleasure",
 94 | 					"diamond",
 95 | 					"gentle",
 96 | 					"honest",
 97 | 					"lucky",
 98 | 					"rainbow",
 99 | 					"diploma",
100 | 					"gift",
101 | 					"honor",
102 | 					"miracle",
103 | 					"sunrise",
104 | 					"family",
105 | 					"happy",
106 | 					"laughter",
107 | 					"paradise",
108 | 					"vacation"
109 | 				],
110 | 				"B":[
111 | 					"abuse",
112 | 					"crash",
113 | 					"filth",
114 | 					"murder",
115 | 					"sickness",
116 | 					"accident",
117 | 					"death",
118 | 					"grief",
119 | 					"poison",
120 | 					"stink",
121 | 					"assault",
122 | 					"disaster",
123 | 					"hatred",
124 | 					"pollute",
125 | 					"tragedy",
126 | 					"divorce",
127 | 					"jail",
128 | 					"poverty",
129 | 					"ugly",
130 | 					"cancer",
131 | 					"kill",
132 | 					"rotten",
133 | 					"vomit",
134 | 					"agony",
135 | 					"prison"
136 | 				]
137 | 			},
138 | 			"Instruments vs Weapons /  Pleasant vs Unpleasant":{
139 | 				"X":[
140 | 					"bagpipe",
141 | 					"cello",
142 | 					"guitar",
143 | 					"lute",
144 | 					"trombone",
145 | 					"banjo",
146 | 					"clarinet",
147 | 					"harmonica",
148 | 					"mandolin",
149 | 					"trumpet",
150 | 					"bassoon",
151 | 					"drum",
152 | 					"harp",
153 | 					"oboe",
154 | 					"tuba",
155 | 					"bell",
156 | 					"fiddle",
157 | 					"harpsichord",
158 | 					"piano",
159 | 					"viola",
160 | 					"bongo",
161 | 					"flute",
162 | 					"horn",
163 | 					"saxophone",
164 | 					"violin"
165 | 				],
166 | 				"Y":[
167 | 					"arrow",
168 | 					"club",
169 | 					"gun",
170 | 					"missile",
171 | 					"spear",
172 | 					"ax",
173 | 					"dagger",
174 | 					"harpoon",
175 | 					"pistol",
176 | 					"sword",
177 | 					"blade",
178 | 					"dynamite",
179 | 					"hatchet",
180 | 					"rifle",
181 | 					"tank",
182 | 					"bomb",
183 | 					"firearm",
184 | 					"knife",
185 | 					"shotgun",
186 | 					"teargas",
187 | 					"cannon",
188 | 					"grenade",
189 | 					"mace",
190 | 					"slingshot",
191 | 					"whip"
192 | 				],
193 | 				"A":[
194 | 					"caress",
195 | 					"freedom",
196 | 					"health",
197 | 					"love",
198 | 					"peace",
199 | 					"cheer",
200 | 					"friend",
201 | 					"heaven",
202 | 					"loyal",
203 | 					"pleasure",
204 | 					"diamond",
205 | 					"gentle",
206 | 					"honest",
207 | 					"lucky",
208 | 					"rainbow",
209 | 					"diploma",
210 | 					"gift",
211 | 					"honor",
212 | 					"miracle",
213 | 					"sunrise",
214 | 					"family",
215 | 					"happy",
216 | 					"laughter",
217 | 					"paradise",
218 | 					"vacation"
219 | 				],
220 | 				"B":[
221 | 					"abuse",
222 | 					"crash",
223 | 					"filth",
224 | 					"murder",
225 | 					"sickness",
226 | 					"accident",
227 | 					"death",
228 | 					"grief",
229 | 					"poison",
230 | 					"stink",
231 | 					"assault",
232 | 					"disaster",
233 | 					"hatred",
234 | 					"pollute",
235 | 					"tragedy",
236 | 					"divorce",
237 | 					"jail",
238 | 					"poverty",
239 | 					"ugly",
240 | 					"cancer",
241 | 					"kill",
242 | 					"rotten",
243 | 					"vomit",
244 | 					"agony",
245 | 					"prison"
246 | 				]
247 | 			},
248 | 			"European names vs African American Names /  Pleasant3 vs Unpleasant3":{
249 | 				"X":[
250 | 					"Adam",
251 | 					"Harry",
252 | 					"Josh",
253 | 					"Roger",
254 | 					"Alan",
255 | 					"Frank",
256 | 					"Justin",
257 | 					"Ryan",
258 | 					"Andrew",
259 | 					"Jack",
260 | 					"Matthew",
261 | 					"Stephen",
262 | 					"Brad",
263 | 					"Greg",
264 | 					"Paul",
265 | 					"Jonathan",
266 | 					"Peter",
267 | 					"Amanda",
268 | 					"Courtney",
269 | 					"Heather",
270 | 					"Melanie",
271 | 					"Katie",
272 | 					"Betsy",
273 | 					"Kristin",
274 | 					"Nancy",
275 | 					"Stephanie",
276 | 					"Ellen",
277 | 					"Lauren",
278 | 					"Colleen",
279 | 					"Emily",
280 | 					"Megan",
281 | 					"Rachel"
282 | 				],
283 | 				"Y":[
284 | 					"Alonzo",
285 | 					"Jamel",
286 | 					"Theo",
287 | 					"Alphonse",
288 | 					"Jerome",
289 | 					"Leroy",
290 | 					"Torrance",
291 | 					"Darnell",
292 | 					"Lamar",
293 | 					"Lionel",
294 | 					"Tyree",
295 | 					"Deion",
296 | 					"Lamont",
297 | 					"Malik",
298 | 					"Terrence",
299 | 					"Tyrone",
300 | 					"Lavon",
301 | 					"Marcellus",
302 | 					"Wardell",
303 | 					"Nichelle",
304 | 					"Shereen",
305 | 					"Ebony",
306 | 					"Latisha",
307 | 					"Shaniqua",
308 | 					"Jasmine",
309 | 					"Tanisha",
310 | 					"Tia",
311 | 					"Lakisha",
312 | 					"Latoya",
313 | 					"Yolanda",
314 | 					"Malika",
315 | 					"Yvette"
316 | 				],
317 | 				"A":[
318 | 					"caress",
319 | 					"freedom",
320 | 					"health",
321 | 					"love",
322 | 					"peace",
323 | 					"cheer",
324 | 					"friend",
325 | 					"heaven",
326 | 					"loyal",
327 | 					"pleasure",
328 | 					"diamond",
329 | 					"gentle",
330 | 					"honest",
331 | 					"lucky",
332 | 					"rainbow",
333 | 					"diploma",
334 | 					"gift",
335 | 					"honor",
336 | 					"miracle",
337 | 					"sunrise",
338 | 					"family",
339 | 					"happy",
340 | 					"laughter",
341 | 					"paradise",
342 | 					"vacation"
343 | 				],
344 | 				"B":[
345 | 					"abuse",
346 | 					"crash",
347 | 					"filth",
348 | 					"murder",
349 | 					"sickness",
350 | 					"accident",
351 | 					"death",
352 | 					"grief",
353 | 					"poison",
354 | 					"stink",
355 | 					"assault",
356 | 					"disaster",
357 | 					"hatred",
358 | 					"pollute",
359 | 					"tragedy",
360 | 					"bomb",
361 | 					"divorce",
362 | 					"jail",
363 | 					"poverty",
364 | 					"ugly",
365 | 					"cancer",
366 | 					"evil",
367 | 					"kill",
368 | 					"rotten",
369 | 					"vomit"
370 | 				]
371 | 			},
372 | 			"Male vs Female /  Career vs Family":{
373 | 				"X":[
374 | 					"John",
375 | 					"Paul",
376 | 					"Mike",
377 | 					"Kevin",
378 | 					"Steve",
379 | 					"Greg",
380 | 					"Jeff",
381 | 					"Bill"
382 | 				],
383 | 				"Y":[
384 | 					"Amy",
385 | 					"Joan",
386 | 					"Lisa",
387 | 					"Sarah",
388 | 					"Diana",
389 | 					"Kate",
390 | 					"Ann",
391 | 					"Donna"
392 | 				],
393 | 				"A":[
394 | 					"executive",
395 | 					"management",
396 | 					"professional",
397 | 					"corporation",
398 | 					"salary",
399 | 					"office",
400 | 					"business",
401 | 					"career"
402 | 				],
403 | 				"B":[
404 | 					"home",
405 | 					"parents",
406 | 					"children",
407 | 					"family",
408 | 					"cousins",
409 | 					"marriage",
410 | 					"wedding",
411 | 					"relatives"
412 | 				]
413 | 			},
414 | 			"Math vs Art / Male8 vs Female8":{
415 | 				"X":[
416 | 					"math",
417 | 					"algebra",
418 | 					"geometry",
419 | 					"calculus",
420 | 					"equations",
421 | 					"computation",
422 | 					"numbers",
423 | 					"addition"
424 | 				],
425 | 				"Y":[
426 | 					"poetry",
427 | 					"art",
428 | 					"Shakespeare",
429 | 					"dance",
430 | 					"literature",
431 | 					"novel",
432 | 					"symphony",
433 | 					"drama"
434 | 				],
435 | 				"A":[
436 | 					"brother", "father", "uncle", "grandfather", "son", "he", "his", "him"
437 | 				],
438 | 				"B":[
439 | 					"sister", "mother", "aunt", "grandmother", "daughter", "she", "hers", "her"
440 | 				]
441 | 			},
442 | 			"Science vs Art / Male8 vs Female8":{
443 | 				"X":[
444 | 					"science",
445 | 					"technology",
446 | 					"physics",
447 | 					"chemistry",
448 | 					"Einstein",
449 | 					"NASA",
450 | 					"experiment",
451 | 					"astronomy"
452 | 				],
453 | 				"Y":[
454 | 					"poetry",
455 | 					"art",
456 | 					"Shakespeare",
457 | 					"dance",
458 | 					"literature",
459 | 					"novel",
460 | 					"symphony",
461 | 					"drama"
462 | 				],
463 | 				"A":[
464 | 					"brother", "father", "uncle", "grandfather", "son", "he", "his", "him"
465 | 				],
466 | 				"B":[
467 | 					"sister", "mother", "aunt", "grandmother", "daughter", "she", "hers", "her"
468 | 				]
469 | 			},
470 | 			"Straight vs Gay / Good vs Bad":{
471 | 				"X":[
472 | 					"straight",
473 | 					"heterosexual"
474 | 				],
475 | 				"Y":[
476 | 					"gay",
477 | 					"homosexual"
478 | 				],
479 | 				"A":[
480 | 					"beautiful",
481 | 					"superb",
482 | 					"joyful",
483 | 					"lovely",
484 | 					"glorious",
485 | 					"pleasure",
486 | 					"marvelous",
487 | 					"wonderful"
488 | 				],
489 | 				"B":[
490 | 					"humiliate",
491 | 					"agony",
492 | 					"awful",
493 | 					"nasty",
494 | 					"terrible",
495 | 					"horrible",
496 | 					"tragic",
497 | 					"painful"
498 | 				]
499 | 			},
500 | 			"Arab + Muslim vs Other /  Good vs Bad":{
501 | 				"X":[
502 | 					"Ernesto",
503 | 					"Matthais",
504 | 					"Maarten",
505 | 					"Philippe",
506 | 					"Guillame",
507 | 					"Benoit",
508 | 					"Takuya",
509 | 					"Kazuki",
510 | 					"Chaiyo",
511 | 					"Marcelo"
512 | 				],
513 | 				"Y":[
514 | 					"hakim",
515 | 					"sharif",
516 | 					"yousef",
517 | 					"wahib",
518 | 					"muhsin",
519 | 					"salim",
520 | 					"karim",
521 | 					"habib",
522 | 					"ashraf",
523 | 					"akbar"
524 | 				],
525 | 				"A":[
526 | 					"laughter",
527 | 					"happy",
528 | 					"joy",
529 | 					"love",
530 | 					"glorious",
531 | 					"pleasure",
532 | 					"peace",
533 | 					"wonderful"
534 | 				],
535 | 				"B":[
536 | 					"failure",
537 | 					"agony",
538 | 					"awful",
539 | 					"nasty",
540 | 					"terrible",
541 | 					"horrible",
 542 | 					"war",
543 | 					"evil"
544 | 				]
545 | 			},
546 | 			"Christianity vs Judaism /  Good vs Bad":{
547 | 				"X":[
548 | 					"church",
549 | 					"bible",
550 | 					"christian",
551 | 					"christianity"
552 | 				],
553 | 				"Y":[
554 | 					"synagogue",
555 | 					"torah",
556 | 					"jew",
557 | 					"judaism"
558 | 				],
559 | 				"A":[
560 | 					"love",
561 | 					"cheer",
562 | 					"friend",
563 | 					"pleasure",
564 | 					"adore",
565 | 					"cheerful",
566 | 					"friendship",
567 | 					"joyful",
568 | 					"smiling",
569 | 					"cherish",
570 | 					"excellent",
571 | 					"glad",
572 | 					"joyous",
573 | 					"spectacular",
574 | 					"appealing",
575 | 					"delight",
576 | 					"excitement",
577 | 					"laughing",
578 | 					"attractive",
579 | 					"delightful",
580 | 					"fabulous",
581 | 					"glorious",
582 | 					"pleasing",
583 | 					"beautiful",
584 | 					"fantastic",
585 | 					"happy",
586 | 					"lovely",
587 | 					"terrific",
588 | 					"celebrate",
589 | 					"enjoy",
590 | 					"magnificent",
591 | 					"triumph"
592 | 				],
593 | 				"B":[
594 | 					"abuse",
595 | 					"grief",
596 | 					"poison",
597 | 					"sadness",
598 | 					"pain",
599 | 					"despise",
600 | 					"failure",
601 | 					"nasty",
602 | 					"angry",
603 | 					"detest",
604 | 					"horrible",
605 | 					"negative",
606 | 					"ugly",
607 | 					"dirty",
608 | 					"gross",
609 | 					"evil",
610 | 					"rotten",
611 | 					"annoy",
612 | 					"disaster",
613 | 					"horrific",
614 | 					"scorn",
615 | 					"awful",
616 | 					"disgust",
617 | 					"hate",
618 | 					"humiliate",
619 | 					"selfish",
620 | 					"tragic",
621 | 					"bothersome",
622 | 					"hatred",
623 | 					"hurtful",
624 | 					"sickening",
625 | 					"yucky"
626 | 				]
627 | 			},
628 | 			"Christianity vs Islam /  Good vs Bad":{
629 | 				"X":[
630 | 					"church",
631 | 					"bible",
632 | 					"christian",
633 | 					"christianity"
634 | 				],
635 | 				"Y":[
636 | 					"mosque",
637 | 					"koran",
638 | 					"muslim",
639 | 					"islam"
640 | 				],
641 | 				"A":[
642 | 					"love",
643 | 					"cheer",
644 | 					"friend",
645 | 					"pleasure",
646 | 					"adore",
647 | 					"cheerful",
648 | 					"friendship",
649 | 					"joyful",
650 | 					"smiling",
651 | 					"cherish",
652 | 					"excellent",
653 | 					"glad",
654 | 					"joyous",
655 | 					"spectacular",
656 | 					"appealing",
657 | 					"delight",
658 | 					"excitement",
659 | 					"laughing",
660 | 					"attractive",
661 | 					"delightful",
662 | 					"fabulous",
663 | 					"glorious",
664 | 					"pleasing",
665 | 					"beautiful",
666 | 					"fantastic",
667 | 					"happy",
668 | 					"lovely",
669 | 					"terrific",
670 | 					"celebrate",
671 | 					"enjoy",
672 | 					"magnificent",
673 | 					"triumph"
674 | 				],
675 | 				"B":[
676 | 					"abuse",
677 | 					"grief",
678 | 					"poison",
679 | 					"sadness",
680 | 					"pain",
681 | 					"despise",
682 | 					"failure",
683 | 					"nasty",
684 | 					"angry",
685 | 					"detest",
686 | 					"horrible",
687 | 					"negative",
688 | 					"ugly",
689 | 					"dirty",
690 | 					"gross",
691 | 					"evil",
692 | 					"rotten",
693 | 					"annoy",
694 | 					"disaster",
695 | 					"horrific",
696 | 					"scorn",
697 | 					"awful",
698 | 					"disgust",
699 | 					"hate",
700 | 					"humiliate",
701 | 					"selfish",
702 | 					"tragic",
703 | 					"bothersome",
704 | 					"hatred",
705 | 					"hurtful",
706 | 					"sickening",
707 | 					"yucky"
708 | 				]
709 | 			},
710 | 			"Judaism vs Islam /  Good vs Bad":{
711 | 				"X":[
712 | 					"synagogue",
713 | 					"torah",
714 | 					"jew",
715 | 					"judaism"
716 | 				],
717 | 				"Y":[
718 | 					"mosque",
719 | 					"koran",
720 | 					"muslim",
721 | 					"islam"
722 | 				],
723 | 				"A":[
724 | 					"love",
725 | 					"cheer",
726 | 					"friend",
727 | 					"pleasure",
728 | 					"adore",
729 | 					"cheerful",
730 | 					"friendship",
731 | 					"joyful",
732 | 					"smiling",
733 | 					"cherish",
734 | 					"excellent",
735 | 					"glad",
736 | 					"joyous",
737 | 					"spectacular",
738 | 					"appealing",
739 | 					"delight",
740 | 					"excitement",
741 | 					"laughing",
742 | 					"attractive",
743 | 					"delightful",
744 | 					"fabulous",
745 | 					"glorious",
746 | 					"pleasing",
747 | 					"beautiful",
748 | 					"fantastic",
749 | 					"happy",
750 | 					"lovely",
751 | 					"terrific",
752 | 					"celebrate",
753 | 					"enjoy",
754 | 					"magnificent",
755 | 					"triumph"
756 | 				],
757 | 				"B":[
758 | 					"abuse",
759 | 					"grief",
760 | 					"poison",
761 | 					"sadness",
762 | 					"pain",
763 | 					"despise",
764 | 					"failure",
765 | 					"nasty",
766 | 					"angry",
767 | 					"detest",
768 | 					"horrible",
769 | 					"negative",
770 | 					"ugly",
771 | 					"dirty",
772 | 					"gross",
773 | 					"evil",
774 | 					"rotten",
775 | 					"annoy",
776 | 					"disaster",
777 | 					"horrific",
778 | 					"scorn",
779 | 					"awful",
780 | 					"disgust",
781 | 					"hate",
782 | 					"humiliate",
783 | 					"selfish",
784 | 					"tragic",
785 | 					"bothersome",
786 | 					"hatred",
787 | 					"hurtful",
788 | 					"sickening",
789 | 					"yucky"
790 | 				]
791 | 			}
792 | 		}
793 | 	}
794 | 


--------------------------------------------------------------------------------
/images/google_news_reddit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hljoren/compare-embedding-bias/7a185137c66d2a9e92a9d10c7e4550048b24ff8e/images/google_news_reddit.png


--------------------------------------------------------------------------------
/images/inst_weap_science_art.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hljoren/compare-embedding-bias/7a185137c66d2a9e92a9d10c7e4550048b24ff8e/images/inst_weap_science_art.png


--------------------------------------------------------------------------------
/read_config.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Read config file for WEAT test
 3 | '''
 4 | import sys
 5 | import json
 6 | import os
 7 | 
 8 | 
 9 | def read_json_config(file_name):
10 |     '''
11 |     Read a set of experiment configuration parameters from a JSON file,
12 |     and return a dictionary with those parameters.
13 | 
14 |     The JSON must two values:
15 | 
16 |     1. embeddings: list of EITHER paths to all embeddings to compare OR nested
17 |     JSON containing embedding name then JSON mapping years to embedding paths
18 |     (used for time series data)
19 | 
20 |     2. tests: JSON of tests including test name, followed by a JSON representing
21 |     the particular test configuration. Each test must have X, Y, A, and B as keys.
22 | 
23 |     (3.) compare_tests: (OPTIONAL) If the experiment is a time series with multiple
24 |     tests and multiple embeddings, indicates whether to compare embeddings
25 |     (one graph per test) or to compare tests (one graph per emebedding).
26 |     Defaults to false, or the latter.
27 | 
28 |     :param file_name: Name of the file containing the configuration
29 |     :return: a dictionary with key the name of the experiment and value a dictionary representing
30 |     '''
31 |     with open(file_name) as json_file:
32 |         data = json.load(json_file)
33 | 
34 |         if 'embeddings' not in data:
35 |             print('Config must contain embedding_paths')
36 |             sys.exit()
37 |         elif 'tests' not in data:
38 |             print('Config must contain tests')
39 |             sys.exit()
40 |         for test_name, experiment_config in data['tests'].items():
41 |             for k in ['X','Y','A','B']:
42 |                 if k not in experiment_config:
43 |                     print('required key ' + k + ' not found in config')
44 |                     sys.exit()
45 | 
46 |     return data
47 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | gensim>=3.7.2
2 | numpy>=1.16.3
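3 | # cPickle: not installable from PyPI on Python 3; the code uses the standard-library pickle module
4 | # json: part of the Python standard library, no install needed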
5 | scipy>=1.2.1
6 | 


--------------------------------------------------------------------------------
/results/caliskan.json:
--------------------------------------------------------------------------------
1 |   {"Flowers vs Insects (25) / Pleasant vs Unpleasant (25)": [1.54],
2 |     "Instruments vs Weapons /  Pleasant vs Unpleasant": [1.63],
3 |     "European names vs African American Names /  Pleasant3 vs Unpleasant3": [0.58],
4 |     "Male vs Female /  Career vs Family": [1.89 ],
5 |     "Math vs Art / Male vs Female": [0.97],
6 |     "Science vs Art / Male vs Female": [1.24]
7 |   }
8 | 


--------------------------------------------------------------------------------
/results_compare_embeddings.json:
--------------------------------------------------------------------------------
1 | {"Google News": {"Math vs Art / Male8 vs Female8": [0.9536, 0.0482], "Christianity vs Islam /  Good vs Bad": [1.2832, 0.0546], "Christianity vs Judaism /  Good vs Bad": [-0.2248, 0.0878], "Arab + Muslim vs Other /  Good vs Bad": [-0.6628, 0.1184], "Straight vs Gay / Good vs Bad": [0.96, 0.1763], "Science vs Art / Male8 vs Female8": [1.175, 0.0411], "Male vs Female /  Career vs Family": [1.9006, 0.0074], "Instruments vs Weapons /  Pleasant vs Unpleasant": [1.5802, 0.0145], "Flowers vs Insects (25) / Pleasant vs Unpleasant (25)": [1.4469, 0.021], "Judaism vs Islam /  Good vs Bad": [1.2434, 0.0547], "European names vs African American Names /  Pleasant3 vs Unpleasant3": [0.5099, 0.0432]}, "Reddit": {"Math vs Art / Male8 vs Female8": [-0.3288, 0.1003], "Christianity vs Islam /  Good vs Bad": [0.7285, 0.0902], "Christianity vs Judaism /  Good vs Bad": [-0.3761, 0.0944], "Straight vs Gay / Good vs Bad": [1.32, 0.151], "Science vs Art / Male8 vs Female8": [0.0315, 0.0846], "Instruments vs Weapons /  Pleasant vs Unpleasant": [1.6374, 0.0127], "Flowers vs Insects (25) / Pleasant vs Unpleasant (25)": [1.4366, 0.0184], "Judaism vs Islam /  Good vs Bad": [0.8813, 0.0768]}}


--------------------------------------------------------------------------------
/results_histwords.json:
--------------------------------------------------------------------------------
1 | {"Histwords": {"1990": {"Math vs Art / Male8 vs Female8": [-0.2178, 0.097], "Christianity vs Islam /  Good vs Bad": [0.7892, 0.0936], "Christianity vs Judaism /  Good vs Bad": [0.5508, 0.0822], "Straight vs Gay / Good vs Bad": [1.12, 0.1665], "Science vs Art / Male8 vs Female8": [-1.0035, 0.0878], "Instruments vs Weapons /  Pleasant vs Unpleasant": [1.465, 0.02], "Flowers vs Insects (25) / Pleasant vs Unpleasant (25)": [0.9482, 0.0307], "Judaism vs Islam /  Good vs Bad": [0.0129, 0.1049]}, "1830": {"Math vs Art / Male8 vs Female8": [0.6742, 0.0708], "Christianity vs Islam /  Good vs Bad": [0.36, 0.1977], "Christianity vs Judaism /  Good vs Bad": [1.88, 0.0686], "Science vs Art / Male8 vs Female8": [1.4644, 0.0546], "Instruments vs Weapons /  Pleasant vs Unpleasant": [1.3434, 0.0332], "Flowers vs Insects (25) / Pleasant vs Unpleasant (25)": [1.0253, 0.0938], "Judaism vs Islam /  Good vs Bad": [-1.44, 0.1395]}, "1810": {"Math vs Art / Male8 vs Female8": [0.6764, 0.0688], "Christianity vs Islam /  Good vs Bad": [-0.2, 0.2], "Christianity vs Judaism /  Good vs Bad": [1.44, 0.1395], "Science vs Art / Male8 vs Female8": [1.363, 0.0602], "Instruments vs Weapons /  Pleasant vs Unpleasant": [1.3856, 0.0528], "Flowers vs Insects (25) / Pleasant vs Unpleasant (25)": [0.4648, 0.1026], "Judaism vs Islam /  Good vs Bad": [-1.36, 0.1474]}, "1980": {"Math vs Art / Male8 vs Female8": [-0.5036, 0.0866], "Christianity vs Islam /  Good vs Bad": [-0.2089, 0.1088], "Christianity vs Judaism /  Good vs Bad": [0.3208, 0.1015], "Straight vs Gay / Good vs Bad": [-0.0, 0.201], "Science vs Art / Male8 vs Female8": [-0.5708, 0.0776], "Instruments vs Weapons /  Pleasant vs Unpleasant": [1.4454, 0.021], "Flowers vs Insects (25) / Pleasant vs Unpleasant (25)": [0.5138, 0.0484], "Judaism vs Islam /  Good vs Bad": [-0.6322, 0.0986]}, "1800": {"Flowers vs Insects (25) / Pleasant vs Unpleasant (25)": [0.12, 0.2006], "Instruments vs Weapons /  Pleasant vs Unpleasant": [1.711, 0.0389], "Math vs Art 
/ Male8 vs Female8": [1.3277, 0.0656], "Science vs Art / Male8 vs Female8": [1.5655, 0.0523], "Christianity vs Judaism /  Good vs Bad": [1.76, 0.0955]}, "1850": {"Math vs Art / Male8 vs Female8": [0.996, 0.0584], "Christianity vs Islam /  Good vs Bad": [1.0, 0.1741], "Christianity vs Judaism /  Good vs Bad": [1.96, 0.04], "Science vs Art / Male8 vs Female8": [1.1213, 0.073], "Instruments vs Weapons /  Pleasant vs Unpleasant": [1.457, 0.0243], "Flowers vs Insects (25) / Pleasant vs Unpleasant (25)": [0.7002, 0.0646], "Judaism vs Islam /  Good vs Bad": [-1.84, 0.0788]}, "1940": {"Math vs Art / Male8 vs Female8": [-0.5336, 0.0865], "Christianity vs Islam /  Good vs Bad": [1.04, 0.0764], "Christianity vs Judaism /  Good vs Bad": [1.055, 0.0679], "Science vs Art / Male8 vs Female8": [-0.5235, 0.0836], "Instruments vs Weapons /  Pleasant vs Unpleasant": [0.931, 0.0411], "Flowers vs Insects (25) / Pleasant vs Unpleasant (25)": [0.8622, 0.0507], "Judaism vs Islam /  Good vs Bad": [-0.1757, 0.0972]}, "1840": {"Math vs Art / Male8 vs Female8": [0.9635, 0.0653], "Christianity vs Islam /  Good vs Bad": [0.6, 0.1917], "Christianity vs Judaism /  Good vs Bad": [1.56, 0.1258], "Science vs Art / Male8 vs Female8": [1.4431, 0.0605], "Instruments vs Weapons /  Pleasant vs Unpleasant": [1.436, 0.0312], "Flowers vs Insects (25) / Pleasant vs Unpleasant (25)": [0.4256, 0.0849], "Judaism vs Islam /  Good vs Bad": [-1.0, 0.1741]}, "1960": {"Math vs Art / Male8 vs Female8": [-0.8122, 0.074], "Christianity vs Islam /  Good vs Bad": [-0.2377, 0.1105], "Christianity vs Judaism /  Good vs Bad": [0.6324, 0.1008], "Straight vs Gay / Good vs Bad": [-0.32, 0.1984], "Science vs Art / Male8 vs Female8": [-0.1685, 0.0841], "Instruments vs Weapons /  Pleasant vs Unpleasant": [1.2334, 0.0283], "Flowers vs Insects (25) / Pleasant vs Unpleasant (25)": [0.7823, 0.0438], "Judaism vs Islam /  Good vs Bad": [-1.0001, 0.0821]}, "1820": {"Math vs Art / Male8 vs Female8": [0.8802, 0.0707], "Christianity vs 
Islam /  Good vs Bad": [-0.08, 0.2008], "Christianity vs Judaism /  Good vs Bad": [1.72, 0.1026], "Science vs Art / Male8 vs Female8": [1.3887, 0.0583], "Instruments vs Weapons /  Pleasant vs Unpleasant": [1.3693, 0.0331], "Flowers vs Insects (25) / Pleasant vs Unpleasant (25)": [0.9509, 0.0943], "Judaism vs Islam /  Good vs Bad": [-1.6, 0.1206]}, "1970": {"Math vs Art / Male8 vs Female8": [-0.7636, 0.0845], "Christianity vs Islam /  Good vs Bad": [0.1711, 0.1043], "Christianity vs Judaism /  Good vs Bad": [0.8838, 0.079], "Straight vs Gay / Good vs Bad": [0.16, 0.2004], "Science vs Art / Male8 vs Female8": [-0.5369, 0.0875], "Instruments vs Weapons /  Pleasant vs Unpleasant": [1.5107, 0.0183], "Flowers vs Insects (25) / Pleasant vs Unpleasant (25)": [0.7652, 0.0462], "Judaism vs Islam /  Good vs Bad": [-0.6089, 0.0876]}, "1920": {"Math vs Art / Male8 vs Female8": [0.2376, 0.0735], "Christianity vs Islam /  Good vs Bad": [0.7562, 0.0837], "Christianity vs Judaism /  Good vs Bad": [1.3283, 0.0587], "Science vs Art / Male8 vs Female8": [0.7002, 0.0777], "Instruments vs Weapons /  Pleasant vs Unpleasant": [1.3371, 0.0241], "Flowers vs Insects (25) / Pleasant vs Unpleasant (25)": [1.2535, 0.0417], "Judaism vs Islam /  Good vs Bad": [-0.4834, 0.1001]}, "1860": {"Math vs Art / Male8 vs Female8": [1.1793, 0.0516], "Christianity vs Islam /  Good vs Bad": [-0.04, 0.201], "Christianity vs Judaism /  Good vs Bad": [1.96, 0.04], "Science vs Art / Male8 vs Female8": [1.3217, 0.0624], "Instruments vs Weapons /  Pleasant vs Unpleasant": [1.4387, 0.0324], "Flowers vs Insects (25) / Pleasant vs Unpleasant (25)": [1.0056, 0.0563], "Judaism vs Islam /  Good vs Bad": [-2.0, 0.0]}, "1930": {"Math vs Art / Male8 vs Female8": [0.4705, 0.0948], "Christianity vs Islam /  Good vs Bad": [-0.1858, 0.106], "Christianity vs Judaism /  Good vs Bad": [0.2581, 0.1012], "Science vs Art / Male8 vs Female8": [1.3398, 0.0495], "Instruments vs Weapons /  Pleasant vs Unpleasant": [1.083, 0.0311], 
"Flowers vs Insects (25) / Pleasant vs Unpleasant (25)": [1.1333, 0.0466], "Judaism vs Islam /  Good vs Bad": [-0.4414, 0.0867]}, "1950": {"Math vs Art / Male8 vs Female8": [0.0094, 0.0869], "Christianity vs Islam /  Good vs Bad": [0.0753, 0.102], "Christianity vs Judaism /  Good vs Bad": [0.4243, 0.1021], "Straight vs Gay / Good vs Bad": [0.08, 0.2008], "Science vs Art / Male8 vs Female8": [0.7273, 0.0812], "Instruments vs Weapons /  Pleasant vs Unpleasant": [1.1959, 0.0329], "Flowers vs Insects (25) / Pleasant vs Unpleasant (25)": [0.7161, 0.0471], "Judaism vs Islam /  Good vs Bad": [-0.4993, 0.1033]}, "1870": {"Math vs Art / Male8 vs Female8": [0.7645, 0.0714], "Christianity vs Islam /  Good vs Bad": [1.44, 0.1395], "Christianity vs Judaism /  Good vs Bad": [2.0, 0.0], "Science vs Art / Male8 vs Female8": [1.0882, 0.058], "Instruments vs Weapons /  Pleasant vs Unpleasant": [1.386, 0.0302], "Flowers vs Insects (25) / Pleasant vs Unpleasant (25)": [0.4463, 0.0731], "Judaism vs Islam /  Good vs Bad": [-1.92, 0.0563]}, "1910": {"Math vs Art / Male8 vs Female8": [0.6551, 0.0741], "Christianity vs Islam /  Good vs Bad": [0.8259, 0.0875], "Christianity vs Judaism /  Good vs Bad": [1.3547, 0.0594], "Science vs Art / Male8 vs Female8": [0.8195, 0.0607], "Instruments vs Weapons /  Pleasant vs Unpleasant": [1.288, 0.0265], "Flowers vs Insects (25) / Pleasant vs Unpleasant (25)": [1.0859, 0.0456], "Judaism vs Islam /  Good vs Bad": [-0.7938, 0.0968]}, "1890": {"Math vs Art / Male8 vs Female8": [0.5593, 0.0781], "Christianity vs Islam /  Good vs Bad": [0.48, 0.1951], "Christianity vs Judaism /  Good vs Bad": [1.2511, 0.0572], "Science vs Art / Male8 vs Female8": [1.1959, 0.0532], "Instruments vs Weapons /  Pleasant vs Unpleasant": [1.4768, 0.0213], "Flowers vs Insects (25) / Pleasant vs Unpleasant (25)": [0.9655, 0.05], "Judaism vs Islam /  Good vs Bad": [-1.36, 0.1474]}, "1900": {"Math vs Art / Male8 vs Female8": [0.7513, 0.0767], "Christianity vs Islam /  Good vs Bad": 
[0.7688, 0.0736], "Christianity vs Judaism /  Good vs Bad": [1.4998, 0.044], "Science vs Art / Male8 vs Female8": [0.7675, 0.0805], "Instruments vs Weapons /  Pleasant vs Unpleasant": [1.3443, 0.028], "Flowers vs Insects (25) / Pleasant vs Unpleasant (25)": [0.7097, 0.0457], "Judaism vs Islam /  Good vs Bad": [-1.1863, 0.0653]}, "1880": {"Math vs Art / Male8 vs Female8": [0.5081, 0.0759], "Christianity vs Islam /  Good vs Bad": [0.72, 0.1875], "Christianity vs Judaism /  Good vs Bad": [2.0, 0.0], "Science vs Art / Male8 vs Female8": [0.8756, 0.0763], "Instruments vs Weapons /  Pleasant vs Unpleasant": [1.4184, 0.025], "Flowers vs Insects (25) / Pleasant vs Unpleasant (25)": [0.8134, 0.05], "Judaism vs Islam /  Good vs Bad": [-1.72, 0.1026]}}}


--------------------------------------------------------------------------------
/run_tests.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Validate results with Caliskan Paper
 3 | 
 4 | USAGE: python run_tests.py path_to_google_news_corpus
 5 | """
 6 | import weat
 7 | import read_config
 8 | import sys
 9 | import json
10 | 
def replicate_caliskan(embed_path):
    """
    Run every test in configs/caliskan.json against one embedding and print
    each result next to the value recorded in results/caliskan.json.

    :param embed_path: path of the embedding file, passed to weat.load_embedding
    :return: None; results are printed to stdout
    """
    print('loading caliskan embedding...')
    embedding = weat.load_embedding(embed_path)
    print('embedding loaded')
    with open('configs/caliskan.json') as config_file:
        config = json.load(config_file)
    with open('results/caliskan.json') as res_file:
        exp_results = json.load(res_file)
    for name_of_test, test_config in config['tests'].items():
        res = weat.diff_assoc(test_config['X'],test_config['Y'],test_config['A'],test_config['B'],embedding)
        # A configured test with no recorded reference value (or a test whose
        # name differs between the two files) must not abort the remaining
        # tests with a KeyError; report the reference as unavailable instead.
        expected = exp_results.get(name_of_test)
        original_finding = expected[0] if expected else 'n/a'
        print(name_of_test + ':')
        print('Result: {} Original Finding: {}\n'.format(res, original_finding))
23 | 
if __name__ == '__main__':
    # The embedding path is the only required argument; a missing argument is
    # a usage error, so exit with a non-zero status.
    if len(sys.argv) < 2:
        print('usage: python run_tests.py path_to_google_news_corpus')
        sys.exit(1)
    em_path = sys.argv[1]
    replicate_caliskan(em_path)
    print('tests complete')
32 | 


--------------------------------------------------------------------------------
/sgns-to-txt.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Convert pretrained histwords embeddings to be compatible with gensim
 3 | '''
 4 | 
 5 | import sys
 6 | import glob, os
 7 | import numpy as np
 8 | import pickle
 9 | 
def _load_embedding(weights_path, vocab_path):
    '''
    Load one embedding: the vector matrix (.npy) and its vocabulary (.pkl).

    :return: tuple (vocab, vectors)
    '''
    vectors = np.load(weights_path, mmap_mode="c")
    # NOTE(review): pickle.load can execute arbitrary code from the file; only
    # run this on vocabulary files from a trusted source.
    with open(vocab_path, "rb") as f:
        vocab = pickle.load(f)
    return vocab, vectors


def _write_txt(output_path, vocab, vectors):
    '''
    Write the embedding as text: a "<vocab_size> <vector_dim>" header line
    followed by one "<word> <v1> <v2> ..." line per vocabulary entry.
    '''
    word_indicies = {w: i for i, w in enumerate(vocab)}
    # Text mode with an explicit encoding: writing encoded bytes to a
    # text-mode file raises TypeError on Python 3, and the default encoding
    # would otherwise depend on the platform locale.
    with open(output_path, 'w', encoding='utf-8') as fp:
        fp.write(str(len(vocab)) + ' ' + str(len(vectors[0])) + '\n')
        for word in vocab:
            fp.write(word + ' ' + ' '.join(map(str, vectors[word_indicies[word], :])) + '\n')


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('usage: python sgns-to-txt.py sgn-directory decade=all')
        sys.exit(1)
    sgn_dir = sys.argv[1]
    # Drop one trailing slash so the "-txts" suffix lands on the directory name.
    if sgn_dir.endswith('/'):
        sgn_dir = sgn_dir[:-1]
    # Create target Directory if doesn't exist
    outputdir = './' + sgn_dir + '-txts'
    if not os.path.exists(outputdir):
        os.mkdir(outputdir)
        print('Directory {} created'.format(outputdir))
    else:
        print('Directory {} already exists'.format(outputdir))
    if len(sys.argv) > 2:
        # Convert only the requested decade.
        decade = sys.argv[2]
        vocab, vectors = _load_embedding(sgn_dir + '/' + decade + "-w.npy",
                                         sgn_dir + '/' + decade + "-vocab.pkl")
        print('vocab_size: {}'.format(len(vocab)))
        print('vector_dim: {}'.format(len(vectors[0])))
        _write_txt(outputdir + '/' + decade + '.txt', vocab, vectors)
    else:
        # Convert every decade found in the directory.
        print("Changing directory to {}".format('./' + sgn_dir))
        os.chdir('./' + sgn_dir)
        print("Current directory is {}".format(os.getcwd()))
        for weights_file in glob.glob("*.npy"):
            # get the year of the file
            d = weights_file[:4]
            print("Loading embedding for {}".format(d))
            vocab, vectors = _load_embedding(d + "-w.npy", d + "-vocab.pkl")
            # The working directory is now sgn_dir, so the output directory is
            # its sibling.
            output_txt = '../' + outputdir.split('/')[-1] + '/' + d + '.txt'
            print("Writing {}".format(output_txt))
            _write_txt(output_txt, vocab, vectors)
60 | 


--------------------------------------------------------------------------------
/weat.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | from gensim.models import KeyedVectors
  3 | import gensim
  4 | import random
  5 | import read_config
  6 | import sys
  7 | import glob
  8 | import os
  9 | import json
 10 | from gensim.models import Word2Vec
 11 | from scipy import stats
 12 | import sys
 13 | import math
 14 | 
 15 | def word_assoc(w,A,B,embedding):
 16 |     """
 17 |     Calculates difference in mean cosine similarity between a word and two sets
 18 |     of words.
 19 |     """
 20 |     return embedding.n_similarity([w],A) - embedding.n_similarity([w],B)
 21 | 
 22 | def diff_assoc(X,Y,A,B,embedding):
 23 |     """
 24 |     Caclulates the WEAT test statics for four sets of words in an embeddings
 25 |     """
 26 |     word_assoc_X = np.array(list(map(lambda x : word_assoc(x,A,B,embedding), X)))
 27 |     word_assoc_Y = np.array(list(map(lambda y : word_assoc(y,A,B,embedding), Y)))
 28 |     mean_diff = np.mean(word_assoc_X) - np.mean(word_assoc_Y)
 29 |     std = np.std(np.concatenate((word_assoc_X, word_assoc_Y), axis=0))
 30 |     return mean_diff / std
 31 | 
 32 | def get_bias_scores_mean_err(word_pairs,embedding):
 33 |     """
 34 |     Caculate the mean WEAT statistic and standard error using a permutation test
 35 |     on the sets of words (defaults to 100 samples)
 36 |     """
 37 |     # divide smaller word_list by two
 38 |     subset_size_target = min(len(word_pairs['X']),len(word_pairs['Y']))//2
 39 |     subset_size_attr = min(len(word_pairs['A']),len(word_pairs['B']))//2
 40 |     bias_scores = []
 41 |     for i in range(100):
 42 |         sX = np.random.choice(word_pairs['X'],subset_size_target,replace=False)
 43 |         sY = np.random.choice(word_pairs['Y'],subset_size_target,replace=False)
 44 |         sA = np.random.choice(word_pairs['A'],subset_size_attr,replace=False)
 45 |         sB = np.random.choice(word_pairs['B'],subset_size_attr,replace=False)
 46 |         bias_scores.append(diff_assoc(sX,sY,sA,sB,embedding))
 47 |     return np.mean(bias_scores), stats.sem(bias_scores)
 48 | 
 49 | 
 50 | def run_test(config, embedding):
 51 |     word_pairs = {}
 52 |     min_len = sys.maxsize
 53 |     # Only include words that are present in the word embedding
 54 |     for word_list_name, word_list in config.items():
 55 |         if word_list_name in ['X', 'Y', 'A', 'B']:
 56 |             word_list_filtered = list(filter(lambda x: x in embedding and np.count_nonzero(embedding[x]) > 0, word_list))
 57 |             word_pairs[word_list_name] = word_list_filtered
 58 |             if len(word_list_filtered) < 2:
 59 |                 print('ERROR: Words from list {} not found in embedding\n {}'.\
 60 |                 format(word_list_name, word_list))
 61 |                 print('All word groups must contain at least two words')
 62 |                 return None, None
 63 |     return get_bias_scores_mean_err(word_pairs,embedding)
 64 | 
 65 | def load_embedding(embed_path):
 66 |     if embed_path.endswith('wv'):
 67 |         return KeyedVectors.load(embed_path)
 68 |     elif embed_path.endswith('txt'):
 69 |         return KeyedVectors.load_word2vec_format(embed_path, binary=False)
 70 |     elif embed_path.endswith('bin'):
 71 |         return KeyedVectors.load_word2vec_format(embed_path, binary=True)
 72 |     # NOTE reddit embedding is saved as model (no ext) + syn1neg + syn0
 73 |     else:
 74 |         return Word2Vec.load(embed_path)
 75 | 
 76 | if __name__ == '__main__':
 77 |     if len(sys.argv) < 2:
 78 |         print('usage: python weat.py config.json results_file=config_results.json')
 79 |         sys.exit(1)
 80 | 
 81 |     fname = sys.argv[1]
 82 |     if len(sys.argv) > 2:
 83 |         results_file = sys.argv[2]
 84 |     else:
 85 |         results_file = 'results_' + fname
 86 |     results = {}
 87 |     config = read_config.read_json_config(fname)
 88 |     for e_name, e in config['embeddings'].items():
 89 |         results[e_name] = {}
 90 |         if not isinstance(e,dict):
 91 |             print('loading embedding {}...'.format(e_name))
 92 |             try:
 93 |                 embedding = load_embedding(e)
 94 |             except:
 95 |                 print('could not load embedding {}'.format(e_name))
 96 |                 continue;
 97 |             for name_of_test, test_config in config['tests'].items():
 98 |                 mean, err = run_test(test_config, embedding)
 99 |                 print('mean: {} err: {}'.format(mean, err))
100 |                 if mean is not None:
101 |                     results[e_name][name_of_test] = (round(mean, 4), round(err,4))
102 |         else:
103 |             print('loading time series embeddings...')
104 |             for time, embed_path in e.items():
105 |                 results[e_name][time] = {}
106 |                 embedding = load_embedding(embed_path)
107 |                 for name_of_test, test_config in config['tests'].items():
108 |                     print(name_of_test)
109 |                     mean, err = run_test(test_config, embedding)
110 |                     print('mean: {} err: {}'.format(mean, err))
111 |                     if mean is not None:
112 |                         results[e_name][time][name_of_test] = (round(mean, 4), round(err,4))
113 |         with open(results_file, 'wb') as outfile:
114 |             json.dump(results, outfile)
115 | 


--------------------------------------------------------------------------------