├── .gitignore
├── LICENSE
├── README.md
├── data.tar.gz
├── docs
    └── appendix.pdf
├── images
    ├── balanced_bug_logo.png
    ├── flow.png
    ├── full_bug_logo.png
    ├── gold_bug_logo.png
    └── spike_logo.png
├── predictions
    └── README.md
├── requirements.txt
├── src
    ├── converters
    │   └── convert_to_conll.py
    └── evaluations
    │   ├── Analyze.py
    │   ├── dataset_stats.py
    │   ├── evaluate_coref.py
    │   └── inc_occ_gender.csv
└── visualizations
    └── delta_s_by_dist.png


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 
131 | # Local
132 | predictions/
133 | data/
134 | *.conll


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 SLAB-NLP
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <!-- START doctoc generated TOC please keep comment here to allow auto update -->
  2 | <!-- DON'T EDIT THIS SECTION, INSTEAD RE-RUN doctoc TO UPDATE -->
  3 | **Table of Contents**  *generated with [DocToc](https://github.com/thlorenz/doctoc)*
  4 | 
  5 | - [BUG Dataset <img src="https://user-images.githubusercontent.com/6629995/132018898-038ec717-264d-4da3-a0b8-651b851f6b64.png" width="30" /><img src="https://user-images.githubusercontent.com/6629995/132017358-dea44bba-1487-464d-a9e1-4d534204570c.png" width="30" /><img src="https://user-images.githubusercontent.com/6629995/132018731-6ec8c4e3-12ac-474c-ae6c-03c1311777f4.png" width="30" />](#bug-dataset-img-srchttpsuser-imagesgithubusercontentcom6629995132018898-038ec717-264d-4da3-a0b8-651b851f6b64png-width30-img-srchttpsuser-imagesgithubusercontentcom6629995132017358-dea44bba-1487-464d-a9e1-4d534204570cpng-width30-img-srchttpsuser-imagesgithubusercontentcom6629995132018731-6ec8c4e3-12ac-474c-ae6c-03c1311777f4png-width30-)
  6 |   - [Setup](#setup)
  7 |   - [Dataset Partitions](#dataset-partitions)
  8 |     - [<img src="https://user-images.githubusercontent.com/6629995/132018898-038ec717-264d-4da3-a0b8-651b851f6b64.png" width="20" /> Full BUG](#img-srchttpsuser-imagesgithubusercontentcom6629995132018898-038ec717-264d-4da3-a0b8-651b851f6b64png-width20--full-bug)
  9 |     - [<img src="https://user-images.githubusercontent.com/6629995/132017358-dea44bba-1487-464d-a9e1-4d534204570c.png" width="20" /> Gold BUG](#img-srchttpsuser-imagesgithubusercontentcom6629995132017358-dea44bba-1487-464d-a9e1-4d534204570cpng-width20--gold-bug)
 10 |     - [<img src="https://user-images.githubusercontent.com/6629995/132018731-6ec8c4e3-12ac-474c-ae6c-03c1311777f4.png" width="20" /> Balanced BUG](#img-srchttpsuser-imagesgithubusercontentcom6629995132018731-6ec8c4e3-12ac-474c-ae6c-03c1311777f4png-width20--balanced-bug)
 11 |   - [Dataset Format](#dataset-format)
 12 |   - [Evaluations](#evaluations)
 13 |     - [Coreference](#coreference)
 14 |   - [Conversions](#conversions)
 15 |     - [CoNLL](#conll)
 16 |   - [Citing](#citing)
 17 | 
 18 | <!-- END doctoc generated TOC please keep comment here to allow auto update -->
 19 | 
 20 | #  BUG Dataset <img src="https://user-images.githubusercontent.com/6629995/132018898-038ec717-264d-4da3-a0b8-651b851f6b64.png" width="30" /><img src="https://user-images.githubusercontent.com/6629995/132017358-dea44bba-1487-464d-a9e1-4d534204570c.png" width="30" /><img src="https://user-images.githubusercontent.com/6629995/132018731-6ec8c4e3-12ac-474c-ae6c-03c1311777f4.png" width="30" />
 21 | A Large-Scale Gender Bias Dataset for Coreference Resolution and Machine Translation (Levy et al., Findings of EMNLP 2021).
 22 | 
 23 | BUG was collected semi-automatically from different real-world corpora, designed to be challenging in terms of societal gender role assignments for machine translation and coreference resolution.
 24 | 
 25 | ## Setup
 26 | 
 27 | 1. Unzip `data.tar.gz`; this should create a `data` folder with the following files:
 28 |    * balanced_BUG.csv
 29 |    * full_BUG.csv
 30 |    * gold_BUG.csv
 31 | 2. Setup a python 3.x environment and install requirements:
 32 | ```
 33 | pip install -r requirements.txt
 34 | ```
 35 | 
 36 | 
 37 | ## Dataset Partitions
 38 | 
 39 | **_NOTE:_**
 40 | These partitions vary slightly from those reported in the paper due to improvements and bug fixes made after submission.
 41 | For reproducibility's sake, you can access the dataset from the submission [here](https://drive.google.com/file/d/1b4Q-X1vVMoR-tIVd-XCigamnvpy0vi3F/view?usp=sharing).
 42 | 
 43 | ### <img src="https://user-images.githubusercontent.com/6629995/132018898-038ec717-264d-4da3-a0b8-651b851f6b64.png" width="20" /> Full BUG
 44 | 105,687 sentences with a human entity, identified by their profession and a gendered pronoun.
 45 | 
 46 | ### <img src="https://user-images.githubusercontent.com/6629995/132017358-dea44bba-1487-464d-a9e1-4d534204570c.png" width="20" /> Gold BUG 
 47 | 
 48 | 1,717 sentences, the gold-quality human-validated samples.
 49 | 
 50 | ### <img src="https://user-images.githubusercontent.com/6629995/132018731-6ec8c4e3-12ac-474c-ae6c-03c1311777f4.png" width="20" /> Balanced BUG
 51 | 25,504 sentences, randomly sampled from Full BUG to ensure balance between male and female entities and between stereotypical and non-stereotypical gender role assignments.
 52 | 
 53 | 
 54 | ## Dataset Format
 55 | Each file in the data folder is a csv file adhering to the following format:
 56 | 
 57 | 
 58 | Column | Header                 | Description
 59 | :-----:|------------------------|--------------------------------------------
 60 | 1      | sentence_text          | Text of sentences with a human entity, identified by their profession and a gendered pronoun
 61 | 2      | tokens                 | List of tokens (using spacy tokenizer)
 62 | 3      | profession             | The entity in the sentence
 63 | 4      | g                      | The pronoun in the sentence
 64 | 5      | profession_first_index | Words offset of profession in sentence
 65 | 6      | g_first_index          | Words offset of pronoun in sentence
 66 | 7      | predicted gender       | 'male'/'female' determined by the pronoun
 67 | 8      | stereotype             | -1/0/1 for anti-stereotype, neutral and stereotype sentence
 68 | 9      | distance               | The abs distance in words between pronoun and profession
 69 | 10      | num_of_pronouns        | Number of pronouns in the sentence
 70 | 11     | corpus                 | The corpus from which the sentence is taken
 71 | 12     | data_index             | The query index of the pattern of the sentence
 72 | 
 73 | ## Evaluations
 74 | See below instructions for reproducing our evaluations on BUG.
 75 | 
 76 | ### Coreference
 77 | 1. Download the SpanBERT predictions from [this link](https://drive.google.com/file/d/1i24T1YT_0ByxttrCRR7qxEnt8UWyEJ7R/view?usp=sharing).
 78 | 2. Unzip and put `coref_preds.jsonl` in the `predictions/` folder.
 79 | 3. From `src/evaluations/`, run `python evaluate_coref.py --in=../../predictions/coref_preds.jsonl --out=../../visualizations/delta_s_by_dist.png`.
 80 | 4. This should reproduce the [coreference evaluation figure](visualizations/delta_s_by_dist.png).
 81 | 
 82 | 
 83 | ## Conversions
 84 | ### CoNLL
 85 | To convert a data partition to CoNLL format, run the following from `src/converters/`:
 86 | ```
 87 | python convert_to_conll.py --in=path/to/input/file --out=path/to/output/file
 88 | ```
 89 | 
 90 | For example, try:
 91 | ```
 92 | python convert_to_conll.py --in=../../data/gold_BUG.csv --out=./gold_bug.conll
 93 | ```
 94 | 
 95 | ### Filter from SPIKE
 96 | 1. Download the wanted [SPIKE](https://spike.apps.allenai.org/) csv files and save them all in the same directory (directory_path).
 97 | 2. Make sure the name of each file ends with `\_<corpusquery><x>.csv`, where `corpus` is the name of the SPIKE dataset and `x` is the number of the query you entered in the search (for example, `myspikedata_wikipedia18.csv`).
 98 | 3. From `src/evaluations/`, run `python Analyze.py directory_path`.
 99 | 4. This should reproduce the full dataset and balanced dataset.
100 | 
101 | 
102 | ## Citing
103 | ```
104 | @misc{levy2021collecting,
105 |       title={Collecting a Large-Scale Gender Bias Dataset for Coreference Resolution and Machine Translation}, 
106 |       author={Shahar Levy and Koren Lazar and Gabriel Stanovsky},
107 |       year={2021},
108 |       eprint={2109.03858},
109 |       archivePrefix={arXiv},
110 |       primaryClass={cs.CL}
111 | }
112 | ```
113 | 
114 | 


--------------------------------------------------------------------------------
/data.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SLAB-NLP/BUG/6b5314d193ecd04a6864ffbfe329b42cf2aa622e/data.tar.gz


--------------------------------------------------------------------------------
/docs/appendix.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SLAB-NLP/BUG/6b5314d193ecd04a6864ffbfe329b42cf2aa622e/docs/appendix.pdf


--------------------------------------------------------------------------------
/images/balanced_bug_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SLAB-NLP/BUG/6b5314d193ecd04a6864ffbfe329b42cf2aa622e/images/balanced_bug_logo.png


--------------------------------------------------------------------------------
/images/flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SLAB-NLP/BUG/6b5314d193ecd04a6864ffbfe329b42cf2aa622e/images/flow.png


--------------------------------------------------------------------------------
/images/full_bug_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SLAB-NLP/BUG/6b5314d193ecd04a6864ffbfe329b42cf2aa622e/images/full_bug_logo.png


--------------------------------------------------------------------------------
/images/gold_bug_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SLAB-NLP/BUG/6b5314d193ecd04a6864ffbfe329b42cf2aa622e/images/gold_bug_logo.png


--------------------------------------------------------------------------------
/images/spike_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SLAB-NLP/BUG/6b5314d193ecd04a6864ffbfe329b42cf2aa622e/images/spike_logo.png


--------------------------------------------------------------------------------
/predictions/README.md:
--------------------------------------------------------------------------------
1 | Download the SpanBERT predictions from [this link](https://drive.google.com/file/d/1i24T1YT_0ByxttrCRR7qxEnt8UWyEJ7R/view?usp=sharing) and unzip in this folder.
2 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | matplotlib
2 | docopt
3 | tqdm
4 | numpy
5 | pandas
6 | 


--------------------------------------------------------------------------------
/src/converters/convert_to_conll.py:
--------------------------------------------------------------------------------
  1 | """ Usage:
  2 |     <file-name> [--in=INPUT_FILE] [--out=OUTPUT_FILE] [--debug]
  3 | 
  4 | Options:
  5 |   --help                           Show this message and exit
  6 |   -i INPUT_FILE --in=INPUT_FILE    Input file
  7 |                                    [default: infile.tmp]
  8 |   -o OUTPUT_FILE --out=OUTPUT_FILE  Output file
  9 |                                    [default: outfile.tmp]
 10 |   --debug                          Whether to debug
 11 | """
 12 | # External imports
 13 | import logging
 14 | import pdb
 15 | from pprint import pprint
 16 | from pprint import pformat
 17 | from docopt import docopt
 18 | from pathlib import Path
 19 | from tqdm import tqdm
 20 | import json
 21 | import pandas as pd
 22 | 
 23 | # Local imports
 24 | 
 25 | 
 26 | #----
 27 | 
 28 | 
# Template for the line that opens each CoNLL document; `doc_name` is filled in per row.
HEADER = "#begin document ({doc_name}); part 000"
# Line that closes each document; the leading newline leaves an empty line
# between the last token row and the "#end document" marker.
FOOTER = "\n#end document"
# Ten placeholder columns emitted between the word column and the
# coreference column of every token row.
BOILER = ["-"] * 5 + ["Speaker#1"] + ["*"] * 4
# Coreference-column value written for both the profession and the pronoun token.
ENTITY = "(1)"
# Row indices that the main loop skips (and counts as errors).
BLIST = [13055, 13996, ] # indices which the converter doesn't like for some reason
GENRE = "nw" # conll parsing requires some genre, "nw" follows the convention
             # in winobias, but is probably arbitrary otherwise.
 36 | 
 37 | 
 38 | def validate_row(row):
 39 |     """
 40 |     run sanity checks on row, return true iff they pass
 41 |     """
 42 |     prof_ind = row.profession_first_index
 43 |     pron_ind = row.g_first_index
 44 |     
 45 |     words = row.sentence_text.lstrip().rstrip().split(" ")
 46 |     num_of_words = len(words)
 47 |     prof = row["profession"].lower()
 48 |     pron = row["g"].lower()
 49 | 
 50 | 
 51 |     # make sure inner references in the line make sense
 52 |     if prof_ind >= len(words):
 53 |         return False
 54 |     if pron_ind >= len(words):
 55 |         return False
 56 |     if words[prof_ind].lower() != prof:
 57 |         logging.debug(f"prof doesn't match")
 58 |         return False
 59 |     if words[pron_ind].lower() != pron:
 60 |         logging.debug(f"pron longer than a single word")
 61 |         return False
 62 | 
 63 |     # don't deal with weird empty tokens
 64 |     if any([(str.isspace(word) or (not word)) for word in words]):
 65 |         return False
 66 | 
 67 |     # all tests passed
 68 |     return True
 69 |     
 70 | 
 71 | def convert_row_to_conll(row, doc_name):
 72 |     """
 73 |     get a conll multi-line string representing a csv row
 74 |     """
 75 |     # find prof_index
 76 |     prof_ind = row.profession_first_index
 77 | 
 78 |     # find pronoun
 79 |     pron_ind = row.g_first_index
 80 | 
 81 |     # construct conll rows
 82 |     conll = []
 83 |     words = row.sentence_text.lstrip().rstrip().split(" ")
 84 |     prof = row["profession"].lower()
 85 |     pron = row["g"].lower()
 86 |     for word_ind, word in enumerate(words):
 87 |         word_lower = word.lower()
 88 |         coref_flag = "-"
 89 |         
 90 |         if word_ind == prof_ind:
 91 |             coref_flag = ENTITY
 92 | 
 93 |         elif word_ind == pron_ind:
 94 |             coref_flag = ENTITY
 95 |                    
 96 |         metadata = list(map(str, [doc_name, 0, word_ind, word]))
 97 |         conll_row = metadata + BOILER + [coref_flag]
 98 |         conll.append("\t".join(conll_row))
 99 | 
100 | 
101 |     conll_data_str = "\n".join(conll)
102 |     header = HEADER.format(doc_name = doc_name)
103 |     full_conll = "\n".join([header,conll_data_str,FOOTER])
104 |     return full_conll
105 |     
106 | 
if __name__ == "__main__":
    # Parse command line arguments
    args = docopt(__doc__)
    inp_fn = Path(args["--in"])
    out_fn = Path(args["--out"])

    # Determine logging level
    debug = args["--debug"]
    logging.basicConfig(level=logging.DEBUG if debug else logging.INFO)

    logging.info(f"Input file: {inp_fn}, Output file: {out_fn}.")

    # Start computation
    df = pd.read_csv(inp_fn)
    top_doc_name = out_fn.stem
    total_rows = len(df)

    # Number of rows that were skipped for any reason
    err_cnt = 0

    with open(out_fn, "w", encoding="utf8") as fout:
        for row_index, row in tqdm(df.iterrows(), total=total_rows):
            # Catch Exception rather than using a bare except so that
            # Ctrl-C / SystemExit still stop the conversion.
            try:
                valid = validate_row(row)
            except Exception:
                err_cnt += 1
                continue

            # Skip rows that fail the sanity checks or are blacklisted
            if not valid or row_index in BLIST:
                err_cnt += 1
                continue

            doc_name = f"{GENRE}/{top_doc_name}/{row_index}"
            try:
                conll = convert_row_to_conll(row, doc_name)
            except Exception:
                err_cnt += 1
                continue
            fout.write(f"{conll}\n")

    # Guard against an empty input file, which would otherwise divide by zero
    perc = round((err_cnt / total_rows) * 100) if total_rows else 0
    rows_written = total_rows - err_cnt
    logging.debug(f"""Wrote a total of {rows_written} to {out_fn}.
    Filtered out {err_cnt} ({perc}%) rows out of {total_rows} total rows.""")

    # End
    logging.info("DONE")
161 | 


--------------------------------------------------------------------------------
/src/evaluations/Analyze.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Usage: Analyze.py directory_path
  3 | """
  4 | 
  5 | import pandas as pd
  6 | import os
  7 | import sys
  8 | import matplotlib.pyplot as plt
  9 | import re
 10 | from tqdm import tqdm
 11 | 
# Module-level entity lists, filled in by create_lists():
#   all_entities - every entity name read from the entities file
#   m_entities   - entities whose M_workers count is larger than F_workers
#   f_entities   - entities whose F_workers count is larger than M_workers
#   n_entities   - entities whose two counts are equal
all_entities, m_entities, f_entities, n_entities = [], [], [], []

# Pronoun forms (lower-case and capitalized) treated as female / male.
f_pronouns = ["she", "herself", "her", "She", "Herself", "Her"]
m_pronouns = ["he", "his", "himself", "him", "He", "His", "Himself", "Him"]
 16 | 
 17 | 
 18 | def create_lists(path):
 19 |     """
 20 |     Takes a txt file of possible entities and fills up the python lists of entities.
 21 |     """
 22 |     df = pd.read_csv(path)
 23 | 
 24 |     for index, row in df.iterrows():
 25 |         if isinstance(row["entities"], str):
 26 |             entities = row["entities"].split(",")
 27 |             entities = [e.rstrip().lstrip().lower() for e in entities]
 28 |             all_entities.extend(entities)
 29 |             f = int(row["F_workers"])
 30 |             m = int(row["M_workers"])
 31 |             if f > m:
 32 |                 f_entities.extend(entities)
 33 |             elif m > f:
 34 |                 m_entities.extend(entities)
 35 |             else:
 36 |                 n_entities.extend(entities)
 37 | 
 38 | 
 39 | def drop_less_quality_data(data):
 40 |     """
 41 |     drop less quality data (discovered by the human annotations statistics)
 42 |     """
 43 |     data = data[data['data_index'] != "6"]
 44 |     data = data[data['data_index'] != "7"]
 45 |     data = data[data['data_index'] != "8"]
 46 |     data = data[data['data_index'] != "12"]
 47 |     data = data[data['data_index'] != "13"]
 48 |     data = data[data['data_index'] != "16"]
 49 |     data = data[data['data_index'] != "19"]
 50 |     data = data[data['corpus'] != "perseus"]
 51 |     data = data[data['distance'] <= 20]
 52 |     data = data[data['num_of_pronouns'] <= 4]
 53 |     return data
 54 | 
 55 | 
 56 | def create_BUG(directory_path):
 57 |     """
 58 |     Receives the spike data and a list of possible entities,
 59 |     filters the data such that the professions column will be only words from the list
 60 |     and creates 2 csv files:
 61 |     the filtered data
 62 |     the distribution over the professions
 63 |     """
 64 |     new_data = pd.DataFrame()
 65 |     i = 0
 66 |     for file_name in os.listdir(directory_path):
 67 |         i += 1
 68 |         if file_name.endswith("tsv"):
 69 |             data = pd.read_csv(os.path.join(directory_path, file_name), sep='\t')
 70 |         elif file_name.endswith("csv"):
 71 |             data = pd.read_csv(os.path.join(directory_path, file_name))
 72 |         else:
 73 |             continue
 74 |         data = data.rename(columns={"er": "g", "gender": "g"})
 75 |         data = data.rename(columns={"er_first_index": "g_first_index", "gender_first_index": "g_first_index"})
 76 |         data = data.rename(columns={"er_last_index": "g_last_index", "gender_last_index": "g_last_index"})
 77 |         data['profession'] = data['profession'].str.lower()
 78 |         data['data_index'] = file_name[-5] if not file_name[-6].isdigit() else file_name[-6:-4]
 79 |         data['corpus'] = file_name.split("_")[0]
 80 |         # assumption: path is- ...._datax.[t,c]sv, while x is the index of the query
 81 |         new_data = new_data.append(data[data['profession'].isin(all_entities)])
 82 | 
 83 |     new_data['stereotype'] = new_data[['profession', 'g']].apply(is_stereotype, axis=1)
 84 |     new_data['distance'] = new_data[['g_first_index', 'profession_first_index']].apply(find_distance, axis=1)
 85 |     new_data['num_of_pronouns'] = new_data[['sentence_text']].apply(num_of_pronouns, axis=1)
 86 | 
 87 |     new_data = new_data.drop_duplicates(subset=['sentence_text'], keep="last")
 88 |     duplicate_pronouns = ["he or she", "she or he", "her or his", "his or her", "her / his", "his / her",
 89 |                           "he / she", "she / he"]
 90 |     for p in duplicate_pronouns:
 91 |         new_data = new_data[~new_data.sentence_text.str.contains(p)]
 92 | 
 93 |     new_data['predicted gender'] = new_data[['g']].apply(predict_gender, axis=1)
 94 | 
 95 |     new_data = drop_less_quality_data(new_data)
 96 |     new_data = clean_columns(new_data)
 97 | 
 98 |     new_data.to_csv("data\\full_BUG.csv")
 99 |     professions = new_data['profession']
100 |     distribution = professions.value_counts()
101 |     distribution.to_csv("data/data_distribution.csv")
102 |     create_balanced("data\\full_BUG.csv")
103 | 
104 |     with open('dropped_en.txt', 'r', encoding="utf8") as f:
105 |         dropped_rows = [line.strip() + "\n" for line in f]
106 | 
107 |     with open("data/en_pro.txt", "w+", encoding="utf-8") as output_file_pro, \
108 |             open("data/en_anti.txt", "w+", encoding="utf-8") as output_file_anti, \
109 |             open("data/en.txt", "w+", encoding="utf-8") as output_file_all:
110 |         for index, row in tqdm(new_data.iterrows()):
111 |             is_dropped = False
112 |             for line in dropped_rows:
113 |                 if row["sentence_text"] in line:
114 |                     is_dropped = True
115 |                     break
116 |             if not is_dropped:
117 |                 line = row['predicted gender'] + "\t" + str(row['profession_first_index']) + "\t" + \
118 |                        row['sentence_text'] + "\t" + row['profession'] + "\t" + row['corpus']
119 |                 if row["stereotype"] == 1:
120 |                     output_file_pro.write(line + "\n")
121 |                 elif row["stereotype"] == -1:
122 |                     output_file_anti.write(line + "\n")
123 |                 output_file_all.write(line + "\n")
124 | 
125 |     corpora = ["wikipedia", "covid19", "pubmed"]
126 |     for corpus in corpora:
127 |         with open("data/en_{}_pro.txt".format(corpus), "w+", encoding="utf-8") as output_file_pro, \
128 |                 open("data/en_{}_anti.txt".format(corpus), "w+", encoding="utf-8") as output_file_anti, \
129 |                 open("data/en_{}.txt".format(corpus), "w+", encoding="utf-8") as output_file_all:
130 |             for index, row in tqdm(new_data.iterrows()):
131 |                 is_dropped = False
132 |                 for line in dropped_rows:
133 |                     if row["sentence_text"] in line:
134 |                         is_dropped = True
135 |                         break
136 |                 if not is_dropped and row['corpus'] == corpus:
137 |                     line = row['predicted gender'] + "\t" + str(row['profession_first_index']) + "\t" + \
138 |                            row['sentence_text'] + "\t" + row['profession'] + "\t" + row['corpus']
139 |                     if row["stereotype"] == 1:
140 |                         output_file_pro.write(line + "\n")
141 |                     elif row["stereotype"] == -1:
142 |                         output_file_anti.write(line + "\n")
143 |                     output_file_all.write(line + "\n")
144 | 
145 |     print("Size of data: " + str(new_data.shape[0]))
146 |     return new_data
147 | 
148 | 
def clean_columns(df):
    """
    Return `df` restricted to the columns of the published dataset, in a
    fixed order. Any other column is left out of the result.
    """
    kept_columns = [
        'sentence_text',
        'profession',
        'g',
        'profession_first_index',
        'g_first_index',
        'predicted gender',
        'stereotype',
        'distance',
        'num_of_pronouns',
        'corpus',
        'data_index',
    ]
    return df[kept_columns]
153 | 
154 | 
def create_balanced(data_path, out_path="data/balanced_BUG.csv", random_state=None):
    """
    Create the balanced partition from the full dataset csv.

    The rows are split by pronoun gender ("her"/"she"/"herself" versus
    "his"/"he"/"himself") and by stereotype (1 / -1). All female
    stereotypical rows are kept, and the same number of rows is sampled from
    each of the three other groups.

    Args:
        data_path: path of the full dataset csv to read.
        out_path: path of the csv to write; defaults to the original
            hard-coded location.
        random_state: optional seed forwarded to ``DataFrame.sample`` so the
            sampling can be reproduced; ``None`` keeps it random.

    Raises:
        ValueError: (from ``DataFrame.sample``) if one of the sampled groups
            has fewer rows than the female stereotypical group.
    """
    data = pd.read_csv(data_path)

    # DataFrame.append was removed in pandas 2.0; pd.concat keeps the same row order.
    data_female = pd.concat([data[data["g"] == pronoun] for pronoun in ("her", "she", "herself")])
    data_male = pd.concat([data[data["g"] == pronoun] for pronoun in ("his", "he", "himself")])

    data_f_a = data_female[data_female["stereotype"] == -1]
    data_f_s = data_female[data_female["stereotype"] == 1]
    data_m_a = data_male[data_male["stereotype"] == -1]
    data_m_s = data_male[data_male["stereotype"] == 1]

    # every other group is down-sampled to the size of the female stereotypical group
    n = data_f_s.shape[0]
    balanced = pd.concat([
        data_f_s,
        data_f_a.sample(n=n, random_state=random_state),
        data_m_s.sample(n=n, random_state=random_state),
        data_m_a.sample(n=n, random_state=random_state),
    ])
    balanced.to_csv(out_path)
174 | 
175 | 
def remove_tags_from_sentence(df):
    """
    Remove one pair of markup tags (e.g. ``<ent>word</ent>``) from a sentence.

    Args:
        df: a pandas Series whose first value is the tagged sentence.

    Returns:
        The sentence with the opening and closing tag removed and the tagged
        text kept in place. If the sentence does not contain the expected
        tags (or is not a string) it is printed and None is returned.
    """
    sentence = df.values[0]
    try:
        before_tag = sentence.split("<")[0]
        # text between the first ">" and the following "<"
        tagged_text = sentence.split(">")[1].split("<")[0]
        # text after the closing tag
        after_tag = sentence.split(">")[2]
    except (AttributeError, IndexError):
        # AttributeError: not a string; IndexError: fewer than two tags
        print(sentence)
        return None
    return before_tag + tagged_text + after_tag
191 | 
192 | 
def add_tags_to_sentence(df):
    """
    Wrap the entity word of a sentence in ``<ent>...</ent>`` tags.

    `df` holds the sentence as its first value and the word offset of the
    entity as its second value; the sentence is split on single spaces.
    """
    fields = df.values
    text = fields[0]
    entity_offset = int(fields[1])
    tokens = text.split(' ')
    tokens[entity_offset] = "<ent>{}</ent>".format(tokens[entity_offset])
    return " ".join(tokens)
203 | 
204 | 
def find_distance(df):
    """
    Return the absolute difference between the pronoun offset (first value
    of `df`) and the entity offset (second value of `df`).
    """
    offsets = df.values
    return abs(offsets[0] - offsets[1])
213 | 
214 | 
def predict_gender(df):
    """
    Return "male" if the pronoun (first value of `df`) is one of the male
    pronouns, and "female" otherwise.
    """
    pronoun = df.values[0]
    return "male" if pronoun in m_pronouns else "female"
224 | 
225 | 
def is_stereotype(df):
    """
    Classify a (profession, pronoun) pair:
    1 : stereotype, -1 : non-stereotype, 0 neutral entity

    `df` holds the profession as its first value and the pronoun as its
    second value.
    """
    fields = df.values
    profession, g = fields[0], fields[1]

    female_profession = profession in f_entities
    male_profession = profession in m_entities
    female_pronoun = g in f_pronouns
    male_pronoun = g in m_pronouns

    # pronoun gender agrees with the majority gender of the profession
    if (female_profession and female_pronoun) or (male_profession and male_pronoun):
        return 1
    # pronoun gender is the opposite of the majority gender of the profession
    if (female_profession and male_pronoun) or (male_profession and female_pronoun):
        return -1
    return 0
238 | 
239 | 
def num_of_pronouns(df):
    """
    Count the gendered pronouns in a sentence.

    Args:
        df: a pandas Series whose first value is the sentence text.

    Returns:
        The number of whole-word, case-insensitive occurrences of
        "he", "she", "his", "himself", "herself", "her" and "him" in the
        sentence, or 1 if the counting itself fails.
    """
    sentence = df.values[0].lower()
    pronouns = ["he", "she", "his", "himself", "herself", "her", "him"]
    try:
        # \b keeps e.g. "the" or "theory" from being counted as "he"
        return sum(len(re.findall(r'\b%s\b' % re.escape(word), sentence)) for word in pronouns)
    except Exception:
        return 1
253 | 
254 | 
def samples_for_double_validation(data):
    """
    Draw 200 random rows from the annotated data and write them to
    "double_validation.csv" so that a second annotator can validate them.

    A 'validation' column initialised to 0 is added to `data` in place
    before sampling.
    """
    export_columns = [
        'predicted gender', 'validation', 'sentence_text', 'profession', 'g',
        'g_first_index', 'profession_first_index', 'stereotype',
        'corpus', 'data_index', 'correct',
    ]
    data['validation'] = 0
    sampled = data.sample(n=200)
    sampled[export_columns].to_csv("double_validation.csv")
264 | 
265 | 
def samples_to_classify(data):
    """
    Export random samples for manual correct/incorrect annotation.

    Three families of CSV files are written into the "human_annotation"
    directory (created if it does not exist):
        * data_samples1.csv .. data_samples10.csv - 1000 random rows of `data`,
          split into ten files of 100 rows each
        * data_samples_query1.csv .. data_samples_query21.csv - 50 random rows
          for each query, i.e. each `data_index` value "1" .. "21"
        * data_samples_<corpus>.csv - 50 random rows for each of the corpora
          wikipedia, perseus, covid19 and pubmed

    In every exported sentence the entity word is wrapped by
    `add_tags_to_sentence`. A 'correct?' column set to 0 is added to `data` in
    place. pandas raises ValueError when a group holds fewer rows than the
    requested sample size.

    :param data: DataFrame holding at least the exported columns
    """
    # Local import so that this function does not depend on the module header.
    import os

    out_dir = "human_annotation"
    export_columns = ['predicted gender', 'correct?', 'sentence_text', 'profession', 'g',
                      'g_first_index', 'profession_first_index', 'stereotype',
                      'corpus', 'data_index']

    def _tag_entities(frame):
        # Mark the entity word inside every sentence of the sampled frame.
        frame['sentence_text'] = \
            frame[['sentence_text', 'profession_first_index']].apply(add_tags_to_sentence, axis=1)
        return frame

    def _export(frame, file_name):
        # os.path.join keeps the output inside out_dir on every platform; the
        # former hard-coded "\\" separator was only a path separator on Windows.
        frame[export_columns].to_csv(os.path.join(out_dir, file_name))

    data['correct?'] = 0
    os.makedirs(out_dir, exist_ok=True)

    # 1000 random rows, handed out in ten chunks of 100
    shuffled = _tag_entities(data.sample(n=1000))
    for i in range(1, 11):
        _export(shuffled[100 * (i - 1):100 * i], "data_samples{}.csv".format(i))

    # 50 random rows per query
    for i in range(1, 22):
        per_query = _tag_entities(data[data.data_index == str(i)].sample(n=50))
        _export(per_query, "data_samples_query{}.csv".format(i))

    # 50 random rows per corpus
    for corpus in ["wikipedia", "perseus", "covid19", "pubmed"]:
        per_corpus = _tag_entities(data[data.corpus == corpus].sample(n=50))
        _export(per_corpus, "data_samples_{}.csv".format(corpus))
296 | 
297 | 
def plot_dict(dict_attributes, color, title):
    """
    Draw a bar chart from a dictionary (attribute -> count or ratio), write the
    value above each bar and save the figure as "data/graphs/<title>.png".

    Values above 1 are printed as they are, other values with two decimals.
    """
    bars = plt.bar(dict_attributes.keys(), dict_attributes.values(), color=color)
    for rect in bars:
        value = rect.get_height()
        label = value if value > 1 else "{:.2f}".format(value)
        top_center = (rect.get_x() + rect.get_width() / 2, value)
        plt.annotate(label, xy=top_center, xytext=(0, 2),
                     textcoords="offset points", ha='center', va='bottom')
    plt.title(title)
    plt.savefig("data/graphs/{}.png".format(title))
    plt.close()
311 | 
312 | 
def statistics_over_samples(data):
    """
    Plot accuracy statistics over the human-annotated samples.

    The number above every bar is
        (number of samples with the attribute whose 'correct' value is '1') /
        (number of samples with the attribute)
    Categories with lower-quality data ("20 < d", more than 4 pronouns, the
    perseus corpus) are intentionally not plotted.

    `data` must already contain a 'num_of_pronouns' column; a 'distance'
    column is computed here and stored in `data`.
    """

    def hits(frame, mask):
        # rows of `frame` selected by `mask` that were annotated as correct
        return sum(frame[mask]['correct'] == '1')

    def pronoun_hits(frame, *pronouns):
        # correct rows of `frame` whose pronoun is one of `pronouns`
        return sum(hits(frame, frame['g'] == p) for p in pronouns)

    def pronoun_count(frame, *pronouns):
        # all rows of `frame` whose pronoun is one of `pronouns`
        return sum(sum(frame['g'] == p) for p in pronouns)

    print("accuracy of human annotations" + str(sum(data['correct'] == '1') / data.shape[0]))

    # --- accuracy per pronoun, then per pronoun gender ---
    statistics_gender_pronouns = {
        pronoun: pronoun_hits(data, pronoun) / pronoun_count(data, pronoun)
        for pronoun in ("himself", "he", "she", "herself", "his", "her")
    }
    female_forms = ("her", "she", "herself")
    male_forms = ("he", "his", "him", "himself")
    statistics_gender_pronouns["female"] = \
        pronoun_hits(data, *female_forms) / pronoun_count(data, *female_forms)
    statistics_gender_pronouns["male"] = \
        pronoun_hits(data, *male_forms) / pronoun_count(data, *male_forms)
    plot_dict(statistics_gender_pronouns, "pink", "Statistics - Gender Pronouns")

    # --- accuracy per distance range (distances above 20 are left out) ---
    data['distance'] = data[['g_first_index', 'profession_first_index']].apply(find_distance, axis=1)
    distance = data['distance']
    statistics_distance = {"d <= 5": hits(data, distance <= 5) / sum(distance <= 5)}
    for low, high, label in ((5, 10, "5 < d <= 10"),
                             (10, 15, "10 < d <= 15"),
                             (15, 20, "15 < d <=20")):
        statistics_distance[label] = \
            (hits(data, distance <= high) - hits(data, distance <= low)) / \
            (sum(distance <= high) - sum(distance <= low))
    plot_dict(statistics_distance, "teal", "Statistics - Distance Between Words")

    # --- accuracy per entity gender ---
    m_data = data[data['profession'].isin(m_entities)]
    f_data = data[data['profession'].isin(f_entities)]
    n_data = data[data['profession'].isin(n_entities)]

    statistics_male_female = {
        label: sum(frame['correct'] == '1') / len(frame)
        for label, frame in (("male\n entities", m_data),
                             ("female\n entities", f_data),
                             ("neutral\n entities", n_data))
    }
    plot_dict(statistics_male_female, "darkmagenta", "Statistics - Male, Female Entities")

    # --- accuracy for stereotypical / non-stereotypical pairs ---
    # each pair below is (correct rows, all rows)
    s_male = (pronoun_hits(m_data, 'he', 'his'), pronoun_count(m_data, 'he', 'his'))
    s_female = (pronoun_hits(f_data, 'her', 'she'), pronoun_count(f_data, 'her', 'she'))
    ns_male = (pronoun_hits(f_data, 'he', 'his'), pronoun_count(f_data, 'he', 'his'))
    ns_female = (pronoun_hits(m_data, 'her', 'she'), pronoun_count(m_data, 'her', 'she'))

    statistics_stereotype = {
        "stereotypes": (s_male[0] + s_female[0]) / (s_male[1] + s_female[1]),
        "s\n male": s_male[0] / s_male[1],
        "s\n female": s_female[0] / s_female[1],
        "non-stereotypes": (ns_male[0] + ns_female[0]) / (ns_male[1] + ns_female[1]),
        "non-s\n male": ns_male[0] / ns_male[1],
        "non-s\n female": ns_female[0] / ns_female[1],
    }
    plot_dict(statistics_stereotype, "orange", "Statistics - Stereotypes")

    # --- accuracy per number of pronouns (more than 4 is left out) ---
    statistics_num_of_pronouns = {
        str(k): hits(data, data['num_of_pronouns'] == k) / sum(data['num_of_pronouns'] == k)
        for k in (1, 2, 3, 4)
    }
    plot_dict(statistics_num_of_pronouns, "maroon", "Statistics - number of pronouns")

    # --- accuracy per corpus (perseus is left out) ---
    count_corpus = {
        name: hits(data, data['corpus'] == name) / sum(data['corpus'] == name)
        for name in ("wikipedia", "covid19", "pubmed")
    }
    plot_dict(count_corpus, "g", "Statistics - statistics_count_corpus")
430 | 
431 | 
def statistics_over_corpus(data_path):
    """
    Plot count statistics over all the sentences stored in the CSV at
    `data_path`.

    The number above every bar is the number of samples with that attribute.
    Categories with lower-quality data (some queries, the perseus and ungd
    corpora) are intentionally not plotted.
    """
    data = pd.read_csv(data_path)
    data['g'] = data['g'].apply(lambda pronoun: pronoun.lower())

    # --- pronouns ---
    count_gender_pronouns = {
        pronoun: sum(data['g'] == pronoun)
        for pronoun in ("herself", "she", "himself", "he", "her", "his")
    }
    plot_dict(count_gender_pronouns, "pink", "count - Gender Pronouns")
    print(count_gender_pronouns)

    # --- entity gender ---
    m_data = data[data['profession'].isin(m_entities)]
    f_data = data[data['profession'].isin(f_entities)]
    n_data = data[data['profession'].isin(n_entities)]

    count_male_female = {
        "male\n entities": m_data.shape[0],
        "female\n entities": f_data.shape[0],
        "neutral\n entities": n_data.shape[0]
    }
    plot_dict(count_male_female, "darkmagenta", "count - Male, Female Entities")

    # --- stereotypes ---
    count_stereotype = {
        "stereotypes": sum(data['stereotype'] == 1),
        "non-stereotypes": sum(data['stereotype'] == -1),
    }
    plot_dict(count_stereotype, "orange", "count - Stereotypes")

    # --- queries (6, 7, 8, 12, 13, 16 and 19 are left out) ---
    plotted_queries = ("1", "2", "3", "4", "5", "9", "10", "11",
                       "14", "15", "17", "18", "20", "21")
    count_data_index = {
        query: sum(data['data_index'] == query) for query in plotted_queries
    }
    plot_dict(count_data_index, "gold", "count - statistics_data_index")

    # --- corpora ---
    count_corpus = {
        name: sum(data['corpus'] == name)
        for name in ("wikipedia", "covid19", "pubmed")
    }
    plot_dict(count_corpus, "g", "count - statistics_count_corpus")

    # --- number of pronouns per sentence ---
    data['num_of_pronouns'] = data[['sentence_text']].apply(num_of_pronouns, axis=1)
    pronoun_counts = data['num_of_pronouns']
    count_num_of_pronouns = {str(k): sum(pronoun_counts == k) for k in (1, 2, 3, 4)}
    count_num_of_pronouns["more then 4"] = sum(pronoun_counts > 4)
    plot_dict(count_num_of_pronouns, "maroon", "count - number of pronouns")

    # --- distance between the pronoun and the entity ---
    data['distance'] = data[['g_first_index', 'profession_first_index']].apply(find_distance, axis=1)
    distance = data['distance']

    count_distance = {"d <= 5": sum(distance <= 5)}
    for low, high, label in ((5, 10, "5 < d <= 10"),
                             (10, 15, "10 < d <= 15"),
                             (15, 20, "15 < d <=20")):
        count_distance[label] = (sum(distance <= high) - sum(distance <= low))
    count_distance["20 < d"] = sum(distance > 20)
    plot_dict(count_distance, "teal", "count - Distance Between Words")
525 | 
526 | 
if __name__ == '__main__':
    # Script entry point: the first command-line argument is handed to create_BUG.
    # NOTE(review): create_lists presumably fills the entity / pronoun lists
    # used by the functions in this file from the occupations CSV - confirm.
    create_lists("inc_occ_gender.csv")
    filtered_data = create_BUG(sys.argv[1])

    # Path used only by the statistics call below, which is currently disabled.
    data_filtered = "data\\full_BUG.csv"
    # statistics_over_corpus(data_filtered)
533 | 
534 | 


--------------------------------------------------------------------------------
/src/evaluations/dataset_stats.py:
--------------------------------------------------------------------------------
 1 | """ Usage:
 2 |     <file-name> [--in=INPUT_FILE] [--out=OUTPUT_FILE] [--debug]
 3 | 
 4 | Options:
 5 |   --help                           Show this message and exit
 6 |   -i INPUT_FILE --in=INPUT_FILE    Input file
 7 |                                    [default: ../data/full_bug.csv]
 8 |   -o INPUT_FILE --out=OUTPUT_FILE  Input file
 9 |                                    [default: outfile.tmp]
10 |   --debug                          Whether to debug
11 | """
12 | # External imports
13 | import logging
14 | import pdb
15 | from pprint import pprint
16 | from pprint import pformat
17 | from docopt import docopt
18 | from pathlib import Path
19 | from tqdm import tqdm
20 | import numpy as np
21 | import json
22 | import pandas as pd
23 | import numpy as np
24 | 
25 | 
26 | # Local imports
27 | 
28 | 
29 | #----
30 | 
def dist_between_prof_ant(df):
    """
    Compute the mean and the standard deviation of the absolute difference
    between the `profession_first_index` and `g_first_index` columns of the
    given data frame.

    Returns a dictionary with the keys "average" and "std".
    """
    gaps = np.abs(df.profession_first_index - df.g_first_index)
    return {"average": np.average(gaps),
            "std": np.std(gaps)}
42 | 
if __name__ == "__main__":

    # Read the command line
    args = docopt(__doc__)
    inp_fn, out_fn = Path(args["--in"]), Path(args["--out"])

    # Verbose logging only when --debug was given
    debug = args["--debug"]
    logging.basicConfig(level = logging.DEBUG if debug else logging.INFO)

    logging.info(f"Input file: {inp_fn}, Output file: {out_fn}.")

    # Read the dataset
    df = pd.read_csv(inp_fn)

    # Split into the stereotypical (1) and anti-stereotypical (-1) partitions
    # and measure the profession / pronoun distance within each of them
    ste = df[df.stereotype == 1]
    ant = df[df.stereotype == -1]
    ste_dist, ant_dist = dist_between_prof_ant(ste), dist_between_prof_ant(ant)

    logging.info("stereotype dist: {}".format(pformat(ste_dist)))
    logging.info("anti-stereotype dist: {}".format(pformat(ant_dist)))

    # End
    logging.info("DONE")
76 | 


--------------------------------------------------------------------------------
/src/evaluations/evaluate_coref.py:
--------------------------------------------------------------------------------
  1 | """ Usage:
  2 |     <file-name> [--in=INPUT_FILE] [--out=OUTPUT_FILE] [--debug]
  3 | 
  4 | Options:
  5 |   --help                           Show this message and exit
  6 |   -i INPUT_FILE --in=INPUT_FILE    Input file
  7 |                                    [default: ../../predictions/coref_preds.jsonl]
  8 |   -o OUTPUT_FILE --out=OUTPUT_FILE  Output file
  9 |                                    [default: ../../visualizations/delta_s_by_dist.png]
 10 |   --debug                          Whether to debug
 11 | """
 12 | # External imports
 13 | import logging
 14 | import pdb
 15 | from pprint import pprint
 16 | from pprint import pformat
 17 | from docopt import docopt
 18 | from pathlib import Path
 19 | from tqdm import tqdm
 20 | import numpy as np
 21 | import json
 22 | from collections import defaultdict
 23 | from math import floor
 24 | import matplotlib.pyplot as plt
 25 | from operator import itemgetter
 26 | 
 27 | # Local imports
 28 | 
 29 | 
 30 | #----
 31 | 
 32 | BIN_SIZE = 5
 33 | 
 34 | def find_cluster_ind(clusters, word_ind):
 35 |     """
 36 |     find the cluster ind for the given word, or -1 if not found
 37 |     """
 38 |     found_in_clusters = []
 39 |     for cluster_ind, cluster in enumerate(clusters):
 40 |         for ent_ind, ent in enumerate(cluster):
 41 |             ent_start, ent_end = ent
 42 |             if (word_ind >= ent_start) and (word_ind <= ent_end):
 43 |                 # found a cluster
 44 |                 found_in_clusters.append(cluster_ind)
 45 | 
 46 |     # no cluster found
 47 |     return found_in_clusters
 48 | 
 49 | 
 50 | def is_correct_pred(line):
 51 |     """
 52 |     return True iff this line represents a correct prediction.
 53 |     """
 54 |     pred = line["pred"]
 55 |     row = line["row"]
 56 |     clusters = pred["clusters"]
 57 |     ent_id = find_cluster_ind(clusters, row["profession_first_index"])
 58 |     pron_id = find_cluster_ind(clusters, row["g_first_index"])
 59 |     
 60 |     # prediction is correct if it assigns pronoun and entity to the same cluster
 61 |     is_correct = len(set(ent_id).intersection(pron_id))
 62 |     
 63 |     return is_correct
 64 | 
 65 | 
 66 | def get_acc(vals):
 67 |     """
 68 |     return the accuracy given binary scores
 69 |     """
 70 |     acc = (sum(vals) / len(vals)) * 100
 71 |     return acc
 72 | 
 73 | def get_delta_s_by_dist(dist_metric):
 74 |     """
 75 |     Compute delta s by distance
 76 |     """
 77 |     dists = sorted(dist_metric.keys())
 78 |     delta_s_by_dist = []
 79 |     for dist in dists:
 80 |         cur_metric = dist_metric[dist]
 81 |         ste = cur_metric[1]
 82 |         ant = cur_metric[-1]
 83 |         delta_s_by_dist.append((dist,
 84 |                                 get_acc(ste) - get_acc(ant),
 85 |                                 len(ste) + len(ant)))
 86 |     return delta_s_by_dist
 87 | 
 88 | def get_simple_metric_by_dist(dist_metric, metric_name):
 89 |     """
 90 |     Aggregate over a given metric by dist.
 91 |     """
 92 |     dists = sorted(dist_metric.keys())
 93 |     met_by_dist = []
 94 |     for dist in dists:
 95 |         met_by_dist.append(get_acc(dist_metric[dist][metric_name]))
 96 |     return met_by_dist
 97 | 
 98 | # Simple instantiations
 99 | get_acc_by_dist = lambda dist_metric: get_simple_metric_by_dist(dist_metric, "acc")
100 | get_ste_by_dist = lambda dist_metric: get_simple_metric_by_dist(dist_metric, 1)
101 | get_ant_by_dist = lambda dist_metric: get_simple_metric_by_dist(dist_metric, -1)
102 | 
def average_buckets(b1, b2):
    """
    Merge two (index, value, count) buckets into one.

    The merged bucket keeps the index of `b1`, sums the counts and holds the
    count-weighted average of the two values.
    """
    first_ind, first_val, first_cnt = b1
    _, second_val, second_cnt = b2

    total = first_cnt + second_cnt
    weighted = ((first_val * first_cnt) + (second_val * second_cnt)) / total
    return (first_ind, weighted, total)
115 |     
if __name__ == "__main__":
    # Parse command line arguments
    args = docopt(__doc__)
    inp_fn = Path(args["--in"])
    out_fn = Path(args["--out"])

    # Determine logging level
    debug = args["--debug"]
    if debug:
        logging.basicConfig(level = logging.DEBUG)
    else:
        logging.basicConfig(level = logging.INFO)

    logging.info(f"Input file: {inp_fn}, Output file: {out_fn}.")

    # Start computation
    # Every list collects one score per prediction; "distance" maps a distance
    # bucket to the scores grouped by stereotype value and under "acc".
    metrics = {"acc": [],
               "ste": [],
               "ant": [],
               "masc": [],
               "femn": [],
               "num_of_pronouns": defaultdict(list),
               "distance": defaultdict(lambda: defaultdict(list))}

    # `with` makes sure the predictions file is closed once it has been read
    with open(inp_fn, encoding = "utf8") as fin:
        for line in tqdm(fin):
            line = json.loads(line.strip())
            is_correct = is_correct_pred(line)
            row = line["row"]
            metrics["acc"].append(is_correct)
            gender = row["predicted gender"].lower()
            stereotype = row["stereotype"]

            if gender == "male":
                metrics["masc"].append(is_correct)
            elif gender == "female":
                metrics["femn"].append(is_correct)

            if stereotype == 1:
                metrics["ste"].append(is_correct)
            elif stereotype == -1:
                metrics["ant"].append(is_correct)

            num_of_prons = row["num_of_pronouns"]
            # NOTE(review): floor(d / BIN_SIZE) puts d == BIN_SIZE into bucket 1,
            # while the tick labels below describe bucket 0 as "1-BIN_SIZE".
            # Confirm which of the two is intended before changing the binning.
            dist = floor(row["distance"] / BIN_SIZE)
            metrics["num_of_pronouns"][num_of_prons].append(is_correct)
            metrics["distance"][dist][stereotype].append(is_correct)
            metrics["distance"][dist]["acc"].append(is_correct)

    acc = get_acc(metrics["acc"])
    delta_g = get_acc(metrics["masc"]) - get_acc(metrics["femn"])
    delta_s = get_acc(metrics["ste"]) - get_acc(metrics["ant"])
    delta_s_by_dist = get_delta_s_by_dist(metrics["distance"])
    acc_by_dist = get_acc_by_dist(metrics["distance"])
    ste_by_dist = get_ste_by_dist(metrics["distance"])
    ant_by_dist = get_ant_by_dist(metrics["distance"])

    # average last bucket
    # delta_s_by_dist[-2] = average_buckets(delta_s_by_dist[-2], delta_s_by_dist[-1])
    # delta_s_by_dist = delta_s_by_dist[:-1]

    logging.info(f"acc = {acc:.1f}; delta_g = {delta_g:.1f}; delta_s = {delta_s:.1f}")
    logging.info(f"delta s by dist = {delta_s_by_dist}")
    logging.info(f"acc by dist = {acc_by_dist}")

    # plot
    plt.rcParams.update({'font.size': 15})
    # Tick labels follow BIN_SIZE instead of a second hard-coded bucket width;
    # the last bucket is shown as open-ended ("<start>>").
    ranges = [f"{(x * BIN_SIZE) + 1}-{(x + 1) * BIN_SIZE}"
              for x in map(itemgetter(0), delta_s_by_dist)]
    ranges[-1] = ranges[-1].split("-")[0] + ">"
    y_pos = np.arange(len(ranges))

    plt.plot(y_pos, ste_by_dist, label = "stereotypical",
             color = "orange", linestyle = "dashed")
    plt.scatter(y_pos, ste_by_dist, color = "orange")

    plt.plot(y_pos, ant_by_dist, label = "anti-stereotypical",
             color = "blue", linestyle = "dotted")
    plt.scatter(y_pos, ant_by_dist, color = "blue")

    plt.xticks(y_pos, ranges)
    plt.ylabel("coreference acc")
    plt.xlabel("distance [words] between pronoun and antecedent")
    plt.legend()
    plt.tight_layout()
    plt.savefig(out_fn)


    # End
    logging.info("DONE")
207 | 


--------------------------------------------------------------------------------
/src/evaluations/inc_occ_gender.csv:
--------------------------------------------------------------------------------
  1 | Occupation,All_workers,All_weekly,M_workers,M_weekly,F_workers,F_weekly,entities
  2 | ALL OCCUPATIONS,109080,809,60746,895,48334,726,
  3 | MANAGEMENT,12480,1351,7332,1486,5147,1139,"manager, boss, principal, executive, headmaster"
  4 | Chief executives,1046,2041,763,2251,283,1836,Chief
  5 | General and operations managers,823,1260,621,1347,202,1002,
  6 | Legislators,8,Na,5,Na,4,Na,Legislator
  7 | Advertising and promotions managers,55,1050,29,Na,26,Na,
  8 | Marketing and sales managers,948,1462,570,1603,378,1258,
  9 | Public relations and fundraising managers,59,1557,24,Na,35,Na,
 10 | Administrative services managers,170,1191,96,1451,73,981,
 11 | Computer and information systems managers,636,1728,466,1817,169,1563,
 12 | Financial managers,1124,1408,551,1732,573,1130,
 13 | Compensation and benefits managers,23,Na,7,Na,16,Na,
 14 | Human resources managers,254,1365,68,1495,186,1274,
 15 | Training and development managers,37,Na,17,Na,20,Na,
 16 | Industrial production managers,267,1485,221,1528,45,Na,
 17 | Purchasing managers,193,1348,109,1404,84,1226,
 18 | "Transportation, storage, and distribution managers",276,966,224,1006,52,749,
 19 | "Farmers, ranchers, and other agricultural managers",129,769,106,847,23,Na,"Agricultural, Farmer, agriculturist, rancher"
 20 | Construction managers,471,1329,429,1357,42,Na,
 21 | Education administrators,778,1423,282,1585,496,1252,administrator
 22 | Architectural and engineering managers,110,1899,101,1892,10,Na,architect
 23 | Food service managers,763,742,389,820,374,680,
 24 | Funeral service managers,13,Na,10,Na,2,Na,
 25 | Gaming managers,19,Na,13,Na,6,Na,
 26 | Lodging managers,123,985,54,1171,68,902,
 27 | Medical and health services managers,592,1210,154,1422,438,1156,
 28 | Natural sciences managers,24,Na,11,Na,13,Na,
 29 | Postmasters and mail superintendents,20,Na,10,Na,10,Na,"Postmaster, superintendent"
 30 | "Property, real estate, and community association managers",401,914,171,1137,230,823,realtor
 31 | Social and community service managers,305,1022,105,1142,200,965,
 32 | Emergency management directors,9,Na,6,Na,3,Na,
 33 | "Managers, all other",2803,1408,1717,1525,1085,1213,
 34 | BUSINESS,5942,1137,2686,1327,3256,1004,
 35 | "Agents and business managers of artists, performers, and athletes",27,Na,13,Na,14,Na,
 36 | "Buyers and purchasing agents, farm products",11,Na,9,Na,2,Na,
 37 | "Wholesale and retail buyers, except farm products",142,926,73,886,69,985,
 38 | "Purchasing agents, except wholesale, retail, and farm products",260,1009,136,1020,124,986,
 39 | "Claims adjusters, appraisers, examiners, and investigators",317,963,141,1134,176,824,"Appraiser, investigator, examiner"
 40 | Compliance officers,235,1198,126,1375,109,1025,
 41 | Cost estimators,95,1232,83,1264,12,Na,
 42 | Human resources workers,592,1002,151,1158,441,984,
 43 | "Compensation, benefits, and job analysis specialists",63,998,12,Na,50,898,
 44 | Training and development specialists,107,990,42,Na,65,1037,
 45 | Logisticians,111,1028,66,1075,44,Na,logistician
 46 | Management analysts,529,1431,291,1519,237,1348,analyst
 47 | "Meeting, convention, and event planners",117,859,27,Na,90,840,planner
 48 | Fundraisers,62,1136,14,Na,48,Na,Fundraiser
 49 | Market research analysts and marketing specialists,203,1284,85,1411,118,1239,
 50 | "Business operations specialists, all other",186,1090,74,1461,112,969,
 51 | Accountants and auditors,1464,1132,618,1345,846,988,"auditor, Accountant"
 52 | Appraisers and assessors of real estate,42,Na,21,Na,21,Na,
 53 | Budget analysts,44,Na,17,Na,28,Na,
 54 | Credit analysts,17,Na,8,Na,9,Na,
 55 | Financial analysts,295,1426,168,1680,127,1171,
 56 | Personal financial advisors,407,1419,248,1738,159,1033,advisor
 57 | Insurance underwriters,106,1149,44,Na,63,956,"insurer, underwriter, stamper, sealer"
 58 | Financial examiners,17,Na,9,Na,8,Na,
 59 | Credit counselors and loan officers,313,997,146,1186,166,906,Counselor
 60 | "Tax examiners and collectors, and revenue agents",59,1051,20,Na,39,Na,
 61 | Tax preparers,56,892,19,Na,37,Na,
 62 | "Financial specialists, all other",66,1162,25,Na,40,Na,specialist
 63 | COMPUTATIONAL,4009,1428,3036,1503,973,1245,
 64 | Computer and information research scientists,28,Na,23,Na,5,Na,
 65 | Computer systems analysts,499,1389,325,1462,173,1256,
 66 | Information security analysts,67,1538,56,1562,11,Na,
 67 | Computer programmers,450,1438,357,1501,93,1302,programmer
 68 | "Software developers, applications and systems software",1287,1682,1054,1751,232,1415,developer
 69 | Web developers,151,1165,98,1233,53,1026,
 70 | Computer support specialists,396,1079,291,1135,105,908,
 71 | Database administrators,90,1536,58,1829,32,Na,
 72 | Network and computer systems administrators,208,1242,179,1266,28,Na,
 73 | Computer network architects,115,1552,100,1577,15,Na,
 74 | "Computer occupations, all other",490,1227,374,1252,116,1145,
 75 | Actuaries,24,Na,18,Na,6,Na,actuary
 76 | Mathematicians,6,Na,6,Na,0,Na,Mathematician
 77 | Operations research analysts,122,1441,59,1574,63,1325,
 78 | Statisticians,76,1275,37,Na,39,Na,Statistician
 79 | Miscellaneous mathematical science occupations,1,Na,1,Na,0,Na,
 80 | ENGINEERING,2656,1424,2272,1452,383,1257,engineer
 81 | "Architects, except naval",138,1441,106,1492,31,Na,
 82 | "Surveyors, cartographers, and photogrammetrists",29,Na,23,Na,6,Na,"photogrammetrist, cartographer, Surveyor"
 83 | Aerospace engineers,140,1662,122,1668,18,Na,
 84 | Agricultural engineers,5,Na,5,Na,0,Na,
 85 | Biomedical engineers,12,Na,10,Na,2,Na,
 86 | Chemical engineers,79,1532,69,1583,10,Na,
 87 | Civil engineers,316,1460,275,1474,41,Na,
 88 | Computer hardware engineers,72,1876,62,1871,10,Na,
 89 | Electrical and electronics engineers,283,1778,246,1819,37,Na,
 90 | Environmental engineers,35,Na,26,Na,9,Na,
 91 | "Industrial engineers, including health and safety",205,1447,168,1430,37,Na,
 92 | Marine engineers and naval architects,9,Na,9,Na,0,Na,
 93 | Materials engineers,36,Na,33,Na,4,Na,
 94 | Mechanical engineers,316,1534,294,1550,23,Na,mechanic
 95 | "Mining and geological engineers, including mining safety engineers",15,Na,15,Na,0,Na,geologist
 96 | Nuclear engineers,5,Na,3,Na,2,Na,
 97 | Petroleum engineers,43,Na,39,Na,3,Na,
 98 | "Engineers, all other",393,1527,339,1537,54,1448,
 99 | Drafters,114,977,91,977,23,Na,Drafter
100 | "Engineering technicians, except drafters",352,963,284,984,68,827,technician
101 | Surveying and mapping technicians,58,1012,54,1031,4,Na,
102 | SCIENCE,1176,1206,662,1379,514,1067,
103 | Agricultural and food scientists,22,Na,12,Na,10,Na,
104 | Biological scientists,74,1233,46,Na,28,Na,biologist
105 | Conservation scientists and foresters,23,Na,16,Na,7,Na,
106 | Medical scientists,151,1250,68,1362,84,1082,
107 | "Life scientists, all other",1,Na,1,Na,0,Na,
108 | Astronomers and physicists,14,Na,11,Na,3,Na,"physicist, Astronomer"
109 | Atmospheric and space scientists,8,Na,8,Na,1,Na,"Atmospheric, astronaut"
110 | Chemists and materials scientists,93,1432,61,1496,33,Na,Chemist
111 | Environmental scientists and geoscientists,90,1423,65,1740,25,Na,geoscientist
112 | "Physical scientists, all other",189,1553,121,1770,68,1170,scientist
113 | Economists,29,Na,17,Na,12,Na,Economist
114 | Survey researchers,0,Na,0,Na,0,Na,
115 | Psychologists,114,1367,31,Na,83,1189,"Psychologist, shrink"
116 | Sociologists,0,Na,0,Na,0,Na,Sociologist
117 | Urban and regional planners,22,Na,13,Na,9,Na,
118 | Miscellaneous social scientists and related workers,37,Na,19,Na,19,Na,
119 | Agricultural and food science technicians,28,Na,21,Na,7,Na,
120 | Biological technicians,20,Na,10,Na,10,Na,
121 | Chemical technicians,75,944,43,Na,32,Na,
122 | Geological and petroleum technicians,22,Na,18,Na,4,Na,
123 | Nuclear technicians,2,Na,2,Na,0,Na,
124 | Social science research assistants,3,Na,0,Na,3,Na,
125 | "Miscellaneous life, physical, and social science technicians",157,846,79,1001,78,780,
126 | SOCIAL SERVICE,2143,889,776,973,1367,845,
127 | Counselors,635,904,184,908,451,902,Counselor
128 | Social workers,677,877,127,943,549,862,
129 | Probation officers and correctional treatment specialists,85,967,42,Na,43,Na,
130 | Social and human service assistants,173,676,23,Na,149,673,
131 | "Miscellaneous community and social service specialists, including health educators and community health workers",92,831,29,Na,63,728,
132 | Clergy,376,1002,316,1021,60,924,"clergy, priest"
133 | "Directors, religious activities and education",62,929,31,Na,31,Na,
134 | "Religious workers, all other",44,Na,23,Na,21,Na,
135 | LEGAL,1346,1391,624,1877,722,1135,
136 | Lawyers,803,1886,503,1914,300,1717,Lawyer
137 | Judicial law clerks,11,Na,1,Na,10,Na,
138 | "Judges, magistrates, and other judicial workers",54,1952,33,Na,20,Na,"Judge, magistrate"
139 | Paralegals and legal assistants,341,927,47,Na,294,910,
140 | Miscellaneous legal support workers,136,770,40,Na,97,746,
141 | EDUCATION,6884,956,1849,1144,5034,907,
142 | Postsecondary teachers,917,1258,516,1405,401,1144,
143 | Preschool and kindergarten teachers,517,616,11,Na,506,618,
144 | Elementary and middle school teachers,2806,974,543,1077,2262,957,"teacher, educator"
145 | Secondary school teachers,1048,1066,438,1149,610,1006,
146 | Special education teachers,297,987,38,Na,258,990,
147 | Other teachers and instructors,378,896,179,1024,199,817,instructor
148 | "Archivists, curators, and museum technicians",38,Na,18,Na,20,Na,
149 | Librarians,130,991,27,Na,102,966,Librarian
150 | Library technicians,18,Na,4,Na,15,Na,
151 | Teacher assistants,614,541,48,Na,565,530,assistant
152 | "Other education, training, and library workers",123,1001,28,Na,95,1031,
153 | ARTS,1643,1001,930,1088,713,942,
154 | Artists and related workers,58,1166,39,Na,20,Na,Artist
155 | Designers,593,993,301,1099,291,918,Designer
156 | Actors,12,Na,8,Na,4,Na,
157 | Producers and directors,120,1270,67,1340,53,1234,"Producer, director"
158 | "Athletes, coaches, umpires, and related workers",147,780,108,818,39,Na,"Athlete, coach, umpire"
159 | Dancers and choreographers,11,Na,2,Na,9,Na,"Dancer, choreographer"
160 | "Musicians, singers, and related workers",42,Na,33,Na,9,Na,"Musician, singer, deejay, dj, Guitarist, Bassist, drummer"
161 | "Entertainers and performers, sports and related workers, all other",14,Na,11,Na,3,Na,Entertainer
162 | Announcers,25,Na,21,Na,5,Na,Announcer
163 | "News analysts, reporters and correspondents",56,1218,29,Na,27,Na,reporter
164 | Public relations specialists,120,1211,49,Na,71,971,
165 | Editors,108,1148,58,1205,50,1125,Editor
166 | Technical writers,52,1158,22,Na,30,Na,
167 | Writers and authors,79,1232,36,Na,42,Na,"Writer, author, journalist"
168 | Miscellaneous media and communication workers,46,Na,16,Na,30,Na,
169 | Broadcast and sound engineering technicians and radio operators,83,954,77,937,7,Na,
170 | Photographers,45,Na,24,Na,21,Na,Photographer
171 | "Television, video, and motion picture camera operators and editors",30,Na,28,Na,1,Na,videographer
172 | "Media and communication equipment workers, all other",2,Na,2,Na,0,Na,
173 | HEALTHCARE PROFESSIONAL,6566,1041,1639,1272,4928,991,
174 | Chiropractors,22,Na,15,Na,7,Na,Chiropractor
175 | Dentists,59,1656,39,Na,20,Na,Dentist
176 | Dietitians and nutritionists,79,886,9,Na,69,879,"Dietitian, nutritionist, dietician, Naturopath"
177 | Optometrists,19,Na,6,Na,13,Na,Optometrist
178 | Pharmacists,206,1920,98,2117,108,1811,Pharmacist
179 | Physicians and surgeons,740,1824,457,1915,283,1533,"Physician, surgeon, doctor, anesthesiologist"
180 | Physician assistants,57,1368,17,Na,40,Na,
181 | Podiatrists,9,Na,5,Na,4,Na,Podiatrist
182 | Audiologists,8,Na,1,Na,6,Na,Audiologist
183 | Occupational therapists,74,1210,10,Na,64,1199,therapist
184 | Physical therapists,178,1265,56,1347,123,1215,
185 | Radiation therapists,12,Na,5,Na,7,Na,
186 | Recreational therapists,6,Na,2,Na,4,Na,
187 | Respiratory therapists,99,1000,32,Na,67,937,
188 | Speech-language pathologists,108,1147,1,Na,106,1148,pathologist
189 | Exercise physiologists,3,Na,3,Na,0,Na,physiologist
190 | "Therapists, all other",132,944,31,Na,101,951,
191 | Veterinarians,55,1455,16,Na,39,Na,Veterinarian
192 | Registered nurses,2382,1116,278,1222,2104,1098,nurse
193 | Nurse anesthetists,23,Na,11,Na,12,Na,
194 | Nurse midwives,6,Na,0,Na,6,Na,
195 | Nurse practitioners,115,1532,11,Na,103,1522,
196 | "Health diagnosing and treating practitioners, all other",2,Na,0,Na,2,Na,
197 | Clinical laboratory technologists and technicians,270,901,69,1089,201,796,laborer
198 | Dental hygienists,86,914,6,Na,80,953,
199 | Diagnostic related technologists and technicians,253,964,76,1106,177,908,
200 | Emergency medical technicians and paramedics,175,811,126,899,49,Na,
201 | Health practitioner support technologists and technicians,487,636,99,652,389,633,
202 | Licensed practical and licensed vocational nurses,508,743,48,Na,459,737,
203 | Medical records and health information technicians,174,740,17,Na,157,723,
204 | "Opticians, dispensing",44,Na,21,Na,23,Na,
205 | Miscellaneous health technologists and technicians,99,671,32,Na,66,642,
206 | Other healthcare practitioners and technical occupations,78,1128,40,Na,38,Na,
207 | HEALTHCARE SUPPORT,2395,498,320,577,2074,490,
208 | "Nursing, psychiatric, and home health aides",1400,467,163,526,1237,457,psychiatric
209 | Occupational therapy assistants and aides,17,Na,4,Na,12,Na,
210 | Physical therapist assistants and aides,53,742,18,Na,35,Na,
211 | Massage therapists,37,Na,16,Na,22,Na,
212 | Dental assistants,188,531,14,Na,175,522,
213 | Medical assistants,422,539,35,Na,387,530,
214 | Medical transcriptionists,28,Na,3,Na,26,Na,
215 | Pharmacy aides,22,Na,4,Na,18,Na,
216 | Veterinary assistants and laboratory animal caretakers,21,Na,4,Na,17,Na,
217 | Phlebotomists,91,551,15,Na,76,534,Phlebotomist
218 | "Miscellaneous healthcare support occupations, including medical equipment preparers",115,524,44,Na,71,511,
219 | PROTECTIVE SERVICE,2729,796,2181,851,547,655,
220 | First-line supervisors of correctional officers,57,856,41,Na,16,Na,
221 | First-line supervisors of police and detectives,114,1427,97,1425,17,Na,detective
222 | First-line supervisors of fire fighting and prevention workers,42,Na,39,Na,3,Na,
223 | "First-line supervisors of protective service workers, all other",72,808,56,825,16,Na,
224 | Firefighters,260,1033,245,1052,16,Na,Firefighter
225 | Fire inspectors,18,Na,15,Na,3,Na,
226 | "Bailiffs, correctional officers, and jailers",453,754,341,779,112,686,"Bailiff, jailer, cop"
227 | Detectives and criminal investigators,141,1159,100,1265,41,Na,
228 | Fish and game wardens,6,Na,5,Na,1,Na,
229 | Parking enforcement workers,9,Na,6,Na,3,Na,
230 | Police and sheriff's patrol officers,655,1002,569,1001,86,1009,"officer, equerry"
231 | Transit and railroad police,3,Na,3,Na,0,Na,
232 | Animal control workers,4,Na,2,Na,2,Na,
233 | Private detectives and investigators,85,843,49,Na,36,Na,
234 | Security guards and gaming surveillance officers,708,567,555,592,153,515,guard
235 | Crossing guards,26,Na,13,Na,13,Na,
236 | Transportation security screeners,33,Na,22,Na,11,Na,
237 | "Lifeguards and other recreational, and all other protective service workers",42,Na,24,Na,18,Na,
238 | CULINARY,4124,441,2133,481,1991,414,
239 | Chefs and head cooks,340,619,285,656,55,492,Chef
240 | First-line supervisors of food preparation and serving workers,378,498,156,621,222,458,
241 | Cooks,1302,416,808,427,494,400,cook
242 | Food preparation workers,366,402,174,414,192,388,
243 | Bartenders,252,521,110,569,142,493,Bartender
244 | "Combined food preparation and serving workers, including fast food",173,391,67,401,107,380,
245 | "Counter attendants, cafeteria, food concession, and coffee shop",56,354,28,Na,28,Na,attendant
246 | Waiters and waitresses,868,451,305,501,563,411,
247 | "Food servers, nonrestaurant",93,509,31,Na,62,485,server
248 | Dining room and cafeteria attendants and bartender helpers,107,403,59,389,48,Na,
249 | Dishwashers,117,398,98,401,19,Na,Dishwasher
250 | "Hosts and hostesses, restaurant, lounge, and coffee shop",66,400,8,Na,58,397,
251 | "Food preparation and serving related workers, all other",6,Na,4,Na,3,Na,
252 | GROUNDSKEEPING,3605,486,2330,517,1275,419,
253 | First-line supervisors of housekeeping and janitorial workers,172,620,108,700,64,571,
254 | "First-line supervisors of landscaping, lawn service, and groundskeeping workers",80,649,79,653,1,Na,
255 | Janitors and building cleaners,1536,507,1111,547,425,429,"janitor, cleaner"
256 | Maids and housekeeping cleaners,876,416,134,475,742,407,"Maid, housekeeper"
257 | Pest control workers,77,585,74,591,3,Na,
258 | Grounds maintenance workers,862,469,824,473,39,Na,
259 | SERVICE,2427,498,664,597,1763,475,
260 | First-line supervisors of gaming workers,117,769,65,900,52,680,
261 | First-line supervisors of personal service workers,60,608,25,Na,35,Na,
262 | Animal trainers,26,Na,19,Na,8,Na,trainer
263 | Nonfarm animal caretakers,100,505,32,Na,68,501,caretaker
264 | Gaming services workers,69,676,30,Na,39,Na,
265 | Motion picture projectionists,3,Na,3,Na,0,Na,
266 | "Ushers, lobby attendants, and ticket takers",9,Na,5,Na,4,Na,Usher
267 | Miscellaneous entertainment attendants and related workers,69,485,42,Na,28,Na,
268 | Embalmers and funeral attendants,2,Na,2,Na,0,Na,
269 | "Morticians, undertakers, and funeral directors",23,Na,17,Na,7,Na,"Mortician, undertaker"
270 | Barbers,46,Na,33,Na,12,Na,Barber
271 | "Hairdressers, hairstylists, and cosmetologists",253,461,23,Na,229,463,"Hairdresser, hairstylist, cosmetologist, beautician, cosmetician, esthetician"
272 | Miscellaneous personal appearance workers,191,501,32,Na,159,497,
273 | "Baggage porters, bellhops, and concierges",75,608,63,606,12,Na,"porter, bellhop, concierge"
274 | Tour and travel guides,21,Na,11,Na,9,Na,guide
275 | Childcare workers,407,437,14,Na,393,430,
276 | Personal care aides,680,462,128,537,552,441,
277 | Recreation and fitness workers,185,555,78,684,107,526,
278 | Residential advisors,24,Na,8,Na,16,Na,
279 | "Personal care and service workers, all other",67,499,35,Na,32,Na,
280 | SALES,9725,716,5423,880,4303,578,
281 | First-line supervisors of retail sales workers,2326,711,1296,825,1030,614,
282 | First-line supervisors of non-retail sales workers,835,1028,556,1140,280,896,
283 | Cashiers,1342,415,411,471,931,405,Cashier
284 | Counter and rental clerks,73,594,35,Na,37,Na,
285 | Parts salespersons,92,601,82,600,11,Na,salesperson
286 | Retail salespersons,1918,590,1159,694,759,494,
287 | Advertising sales agents,161,925,78,1155,83,729,
288 | Insurance sales agents,427,815,194,1028,232,717,
289 | "Securities, commodities, and financial services sales agents",211,1155,146,1461,65,767,
290 | Travel agents,62,711,9,Na,53,685,agent
291 | "Sales representatives, services, all other",406,966,268,1147,139,699,
292 | "Sales representatives, wholesale and manufacturing",1138,1020,843,1066,295,917,
293 | "Models, demonstrators, and product promoters",15,Na,4,Na,11,Na,
294 | Real estate brokers and sales agents,463,837,197,1052,266,735,
295 | Sales engineers,33,Na,31,Na,2,Na,
296 | Telemarketers,39,Na,17,Na,21,Na,Telemarketer
297 | "Door-to-door sales workers, news and street vendors, and related workers",28,Na,9,Na,18,Na,
298 | "Sales and related workers, all other",158,916,89,1088,70,727,
299 | OFFICE,13894,656,3961,693,9933,646,
300 | First-line supervisors of office and administrative support workers,1297,812,434,878,863,781,
301 | "Switchboard operators, including answering service",17,Na,7,Na,10,Na,
302 | Telephone operators,22,Na,4,Na,18,Na,
303 | "Communications equipment operators, all other",5,Na,2,Na,3,Na,
304 | Bill and account collectors,152,657,54,674,98,648,
305 | Billing and posting clerks,406,657,39,Na,366,664,
306 | "Bookkeeping, accounting, and auditing clerks",769,692,87,690,682,692,
307 | Gaming cage workers,11,Na,2,Na,10,Na,
308 | Payroll and timekeeping clerks,128,757,17,Na,111,751,
309 | Procurement clerks,35,Na,15,Na,21,Na,
310 | Tellers,264,514,33,Na,231,516,Teller
311 | "Financial clerks, all other",61,767,30,Na,32,Na,
312 | Brokerage clerks,3,Na,1,Na,1,Na,
313 | Correspondence clerks,3,Na,1,Na,2,Na,
314 | "Court, municipal, and license clerks",60,755,9,Na,51,743,
315 | "Credit authorizers, checkers, and clerks",42,Na,12,Na,29,Na,
316 | Customer service representatives,1760,621,611,690,1149,604,
317 | "Eligibility interviewers, government programs",67,781,17,Na,50,805,
318 | File clerks,145,634,25,Na,120,627,
319 | "Hotel, motel, and resort desk clerks",127,481,58,486,69,467,
320 | "Interviewers, except eligibility and loan",105,615,16,Na,89,617,
321 | "Library assistants, clerical",35,Na,7,Na,28,Na,
322 | Loan interviewers and clerks,134,710,25,Na,109,722,
323 | New accounts clerks,20,Na,3,Na,17,Na,
324 | Order clerks,74,599,26,Na,48,Na,
325 | "Human resources assistants, except payroll and timekeeping",50,737,11,Na,40,Na,
326 | Receptionists and information clerks,852,575,72,619,781,569,
327 | Reservation and transportation ticket agents and travel clerks,95,713,34,Na,61,680,
328 | "Information and record clerks, all other",100,618,22,Na,78,616,
329 | Cargo and freight agents,20,Na,12,Na,9,Na,
330 | Couriers and messengers,153,752,134,750,19,Na,Courier
331 | Dispatchers,250,698,109,759,141,655,
332 | "Meter readers, utilities",39,Na,34,Na,4,Na,
333 | Postal service clerks,127,927,76,974,51,833,
334 | Postal service mail carriers,302,954,187,1021,115,854,
335 | "Postal service mail sorters, processors, and processing machine operators",53,828,27,Na,26,Na,
336 | "Production, planning, and expediting clerks",256,838,116,978,141,732,
337 | "Shipping, receiving, and traffic clerks",502,591,354,604,148,566,
338 | Stock clerks and order fillers,1027,520,651,537,376,506,
339 | "Weighers, measurers, checkers, and samplers, recordkeeping",59,629,29,Na,31,Na,
340 | Secretaries and administrative assistants,2223,687,124,786,2099,683,
341 | Computer operators,58,751,25,Na,33,Na,
342 | Data entry keyers,223,619,55,589,169,638,
343 | Word processors and typists,68,650,6,Na,62,639,
344 | Desktop publishers,1,Na,1,Na,0,Na,
345 | Insurance claims and policy processing clerks,259,689,56,762,203,675,
346 | "Mail clerks and mail machine operators, except postal service",63,563,24,Na,39,Na,
347 | "Office clerks, general",929,620,156,609,773,622,"clerk, secretary"
348 | "Office machine operators, except computer",31,Na,14,Na,16,Na,
349 | Proofreaders and copy markers,2,Na,0,Na,2,Na,
350 | Statistical assistants,15,Na,5,Na,10,Na,
351 | "Office and administrative support workers, all other",391,745,93,852,298,718,
352 | AGRICULTURAL,810,464,637,477,174,437,Agricultural
353 | "First-line supervisors of farming, fishing, and forestry workers",42,Na,32,Na,10,Na,
354 | Agricultural inspectors,12,Na,7,Na,6,Na,
355 | Animal breeders,2,Na,2,Na,0,Na,breeder
356 | "Graders and sorters, agricultural products",83,486,32,Na,51,468,
357 | Miscellaneous agricultural workers,613,445,511,460,102,398,
358 | Fishers and related fishing workers,11,Na,11,Na,0,Na,
359 | Hunters and trappers,0,Na,0,Na,0,Na,"Hunter, trapper"
360 | Forest and conservation workers,15,Na,10,Na,5,Na,
361 | Logging workers,31,Na,31,Na,0,Na,
362 | CONSTRUCTION,5722,749,5586,751,137,704,"constructor, builder"
363 | First-line supervisors of construction trades and extraction workers,560,1040,540,1047,20,Na,
364 | Boilermakers,21,Na,21,Na,0,Na,Boilermaker
365 | "Brickmasons, blockmasons, and stonemasons",122,652,122,652,0,Na,
366 | Carpenters,802,687,792,687,10,Na,Carpenter
367 | "Carpet, floor, and tile installers and finishers",89,637,89,634,1,Na,Installer
368 | "Cement masons, concrete finishers, and terrazzo workers",44,Na,44,Na,0,Na,
369 | Construction laborers,1181,639,1155,642,25,Na,
370 | "Paving, surfacing, and tamping equipment operators",10,Na,10,Na,0,Na,
371 | Pile-driver operators,2,Na,2,Na,0,Na,
372 | Operating engineers and other construction equipment operators,324,856,318,859,6,Na,
373 | "Drywall installers, ceiling tile installers, and tapers",121,596,119,595,2,Na,
374 | Electricians,651,888,632,891,19,Na,Electrician
375 | Glaziers,33,Na,32,Na,0,Na,Glazier
376 | Insulation workers,43,Na,41,Na,3,Na,
377 | "Painters, construction and maintenance",344,585,330,587,14,Na,Painter
378 | Paperhangers,0,Na,0,Na,0,Na,
379 | "Pipelayers, plumbers, pipefitters, and steamfitters",456,863,455,862,2,Na,plumber
380 | Plasterers and stucco masons,20,Na,19,Na,1,Na,
381 | Reinforcing iron and rebar workers,9,Na,9,Na,0,Na,
382 | Roofers,171,584,170,580,2,Na,Roofer
383 | Sheet metal workers,106,766,100,776,6,Na,
384 | Structural iron and steel workers,54,869,52,864,2,Na,
385 | Solar photovoltaic installers,8,Na,8,Na,0,Na,
386 | "Helpers, construction trades",47,Na,47,Na,0,Na,"trader, arborist, dealer"
387 | Construction and building inspectors,67,939,58,965,9,Na,inspector
388 | Elevator installers and repairers,23,Na,23,Na,0,Na,repairer
389 | Fence erectors,33,Na,33,Na,0,Na,
390 | Hazardous materials removal workers,39,Na,35,Na,5,Na,
391 | Highway maintenance workers,91,754,91,755,0,Na,
392 | Rail-track laying and maintenance equipment operators,9,Na,9,Na,0,Na,
393 | Septic tank servicers and sewer pipe cleaners,8,Na,8,Na,0,Na,
394 | Miscellaneous construction and related workers,25,Na,23,Na,2,Na,
395 | "Derrick, rotary drill, and service unit operators, oil, gas, and mining",28,Na,27,Na,1,Na,
396 | "Earth drillers, except oil and gas",30,Na,30,Na,0,Na,
397 | "Explosives workers, ordnance handling experts, and blasters",5,Na,5,Na,0,Na,
398 | Mining machine operators,68,1106,65,1098,2,Na,
399 | "Roof bolters, mining",3,Na,3,Na,0,Na,
400 | "Roustabouts, oil and gas",7,Na,7,Na,0,Na,
401 | Helpers--extraction workers,6,Na,6,Na,0,Na,
402 | Other extraction workers,61,900,58,918,3,Na,
403 | MAINTENANCE,4301,839,4159,842,143,761,
404 | "First-line supervisors of mechanics, installers, and repairers",270,1032,252,1033,18,Na,
405 | "Computer, automated teller, and office machine repairers",194,856,166,865,28,Na,
406 | Radio and telecommunications equipment installers and repairers,139,862,126,879,13,Na,
407 | Avionics technicians,4,Na,4,Na,0,Na,
408 | "Electric motor, power tool, and related repairers",22,Na,20,Na,1,Na,
409 | "Electrical and electronics installers and repairers, transportation equipment",2,Na,2,Na,0,Na,
410 | "Electrical and electronics repairers, industrial and utility",18,Na,17,Na,0,Na,
411 | "Electronic equipment installers and repairers, motor vehicles",17,Na,17,Na,0,Na,
412 | Electronic home entertainment equipment installers and repairers,30,Na,28,Na,2,Na,
413 | Security and fire alarm systems installers,67,911,65,913,2,Na,
414 | Aircraft mechanics and service technicians,133,1025,125,1032,7,Na,
415 | Automotive body and related repairers,120,846,118,849,2,Na,
416 | Automotive glass installers and repairers,21,Na,20,Na,1,Na,
417 | Automotive service technicians and mechanics,710,722,694,724,16,Na,
418 | Bus and truck mechanics and diesel engine specialists,327,831,327,830,0,Na,
419 | Heavy vehicle and mobile equipment service technicians and mechanics,206,928,206,928,0,Na,
420 | Small engine mechanics,39,Na,39,Na,0,Na,
421 | "Miscellaneous vehicle and mobile equipment mechanics, installers, and repairers",66,592,65,591,1,Na,
422 | Control and valve installers and repairers,23,Na,23,Na,0,Na,
423 | "Heating, air conditioning, and refrigeration mechanics and installers",341,806,337,810,4,Na,
424 | Home appliance repairers,36,Na,36,Na,0,Na,
425 | Industrial and refractory machinery mechanics,394,895,383,894,11,Na,
426 | "Maintenance and repair workers, general",469,773,459,771,10,Na,
427 | "Maintenance workers, machinery",31,Na,30,Na,1,Na,
428 | Millwrights,49,Na,48,Na,1,Na,
429 | Electrical power-line installers and repairers,113,1105,112,1105,0,Na,
430 | Telecommunications line installers and repairers,157,882,148,880,9,Na,
431 | Precision instrument and equipment repairers,64,996,60,1009,4,Na,
432 | Wind turbine service technicians,3,Na,3,Na,0,Na,
433 | "Coin, vending, and amusement machine servicers and repairers",38,Na,34,Na,4,Na,
434 | Commercial divers,1,Na,1,Na,0,Na,
435 | Locksmiths and safe repairers,12,Na,12,Na,0,Na,
436 | Manufactured building and mobile home installers,4,Na,4,Na,1,Na,
437 | Riggers,7,Na,7,Na,0,Na,Rigger
438 | Signal and track switch repairers,9,Na,9,Na,0,Na,
439 | "Helpers--installation, maintenance, and repair workers",17,Na,17,Na,0,Na,
440 | "Other installation, maintenance, and repair workers",150,792,144,810,6,Na,
441 | PRODUCTION,7551,663,5548,729,2003,519,
442 | First-line supervisors of production and operating workers,783,875,650,924,133,623,
443 | "Aircraft structure, surfaces, rigging, and systems assemblers",15,Na,11,Na,4,Na,
444 | "Electrical, electronics, and electromechanical assemblers",123,554,59,566,64,544,
445 | Engine and other machine assemblers,14,Na,12,Na,2,Na,
446 | Structural metal fabricators and fitters,31,Na,28,Na,3,Na,
447 | Miscellaneous assemblers and fabricators,950,581,573,637,377,512,
448 | Bakers,150,505,69,570,80,475,baker
449 | "Butchers and other meat, poultry, and fish processing workers",247,542,187,582,60,463,
450 | "Food and tobacco roasting, baking, and drying machine operators and tenders",9,Na,7,Na,1,Na,
451 | Food batchmakers,79,500,25,Na,54,489,
452 | Food cooking machine operators and tenders,7,Na,5,Na,2,Na,
453 | "Food processing workers, all other",132,594,82,679,50,508,
454 | Computer control programmers and operators,83,833,81,857,2,Na,
455 | "Extruding and drawing machine setters, operators, and tenders, metal and plastic",8,Na,7,Na,1,Na,
456 | "Forging machine setters, operators, and tenders, metal and plastic",6,Na,6,Na,0,Na,
457 | "Rolling machine setters, operators, and tenders, metal and plastic",15,Na,12,Na,3,Na,
458 | "Cutting, punching, and press machine setters, operators, and tenders, metal and plastic",78,633,62,674,15,Na,
459 | "Drilling and boring machine tool setters, operators, and tenders, metal and plastic",5,Na,5,Na,1,Na,
460 | "Grinding, lapping, polishing, and buffing machine tool setters, operators, and tenders, metal and plastic",41,Na,39,Na,3,Na,
461 | "Lathe and turning machine tool setters, operators, and tenders, metal and plastic",12,Na,11,Na,1,Na,
462 | "Milling and planing machine setters, operators, and tenders, metal and plastic",1,Na,1,Na,0,Na,
463 | Machinists,338,834,320,840,17,Na,Machinist
464 | "Metal furnace operators, tenders, pourers, and casters",29,Na,28,Na,1,Na,
465 | "Model makers and patternmakers, metal and plastic",6,Na,3,Na,3,Na,
466 | "Molders and molding machine setters, operators, and tenders, metal and plastic",47,Na,39,Na,9,Na,
467 | "Multiple machine tool setters, operators, and tenders, metal and plastic",1,Na,1,Na,0,Na,
468 | Tool and die makers,49,Na,49,Na,0,Na,
469 | "Welding, soldering, and brazing workers",568,760,545,767,23,Na,
470 | "Heat treating equipment setters, operators, and tenders, metal and plastic",4,Na,4,Na,0,Na,
471 | "Layout workers, metal and plastic",4,Na,4,Na,1,Na,
472 | "Plating and coating machine setters, operators, and tenders, metal and plastic",24,Na,24,Na,0,Na,
473 | "Tool grinders, filers, and sharpeners",7,Na,7,Na,0,Na,
474 | "Metal workers and plastic workers, all other",351,639,278,678,72,581,
475 | Prepress technicians and workers,14,Na,12,Na,2,Na,
476 | Printing press operators,160,707,134,729,26,Na,
477 | Print binding and finishing workers,16,Na,9,Na,6,Na,
478 | Laundry and dry-cleaning workers,133,466,53,487,80,460,
479 | "Pressers, textile, garment, and related materials",21,Na,9,Na,12,Na,
480 | Sewing machine operators,147,493,42,Na,105,476,
481 | Shoe and leather workers and repairers,5,Na,4,Na,1,Na,
482 | Shoe machine operators and tenders,1,Na,0,Na,1,Na,
483 | "Tailors, dressmakers, and sewers",37,Na,9,Na,27,Na,"Tailor, dressmaker"
484 | Textile bleaching and dyeing machine operators and tenders,2,Na,2,Na,0,Na,
485 | "Textile cutting machine setters, operators, and tenders",9,Na,7,Na,2,Na,
486 | "Textile knitting and weaving machine setters, operators, and tenders",8,Na,3,Na,4,Na,
487 | "Textile winding, twisting, and drawing out machine setters, operators, and tenders",7,Na,5,Na,2,Na,
488 | "Extruding and forming machine setters, operators, and tenders, synthetic and glass fibers",0,Na,0,Na,0,Na,
489 | Fabric and apparel patternmakers,4,Na,3,Na,1,Na,
490 | Upholsterers,29,Na,21,Na,7,Na,Upholsterer
491 | "Textile, apparel, and furnishings workers, all other",16,Na,12,Na,4,Na,
492 | Cabinetmakers and bench carpenters,40,Na,38,Na,2,Na,
493 | Furniture finishers,6,Na,6,Na,0,Na,
494 | "Model makers and patternmakers, wood",0,Na,0,Na,0,Na,
495 | "Sawing machine setters, operators, and tenders, wood",26,Na,22,Na,4,Na,
496 | "Woodworking machine setters, operators, and tenders, except sawing",23,Na,21,Na,1,Na,
497 | "Woodworkers, all other",17,Na,14,Na,3,Na,Woodworker
498 | "Power plant operators, distributors, and dispatchers",35,Na,34,Na,1,Na,
499 | Stationary engineers and boiler operators,84,996,81,1012,3,Na,
500 | Water and wastewater treatment plant and system operators,82,880,79,868,3,Na,
501 | Miscellaneous plant and system operators,35,Na,33,Na,3,Na,
502 | "Chemical processing machine setters, operators, and tenders",62,1052,57,1082,5,Na,
503 | "Crushing, grinding, polishing, mixing, and blending workers",82,652,75,668,7,Na,
504 | Cutting workers,51,685,41,Na,9,Na,
505 | "Extruding, forming, pressing, and compacting machine setters, operators, and tenders",31,Na,25,Na,6,Na,
506 | "Furnace, kiln, oven, drier, and kettle operators and tenders",6,Na,5,Na,0,Na,
507 | "Inspectors, testers, sorters, samplers, and weighers",701,710,440,844,260,583,
508 | Jewelers and precious stone and metal workers,19,Na,11,Na,7,Na,Jeweler
509 | "Medical, dental, and ophthalmic laboratory technicians",86,648,44,Na,42,Na,
510 | Packaging and filling machine operators and tenders,239,518,118,605,120,482,
511 | Painting workers,129,708,110,733,18,Na,
512 | Photographic process workers and processing machine operators,26,Na,12,Na,14,Na,
513 | Semiconductor processors,1,Na,1,Na,0,Na,
514 | Adhesive bonding machine operators and tenders,9,Na,5,Na,4,Na,
515 | "Cleaning, washing, and metal pickling equipment operators and tenders",4,Na,2,Na,1,Na,
516 | Cooling and freezing equipment operators and tenders,4,Na,3,Na,0,Na,
517 | Etchers and engravers,12,Na,8,Na,4,Na,engraver
518 | "Molders, shapers, and casters, except metal and plastic",14,Na,12,Na,2,Na,
519 | "Paper goods machine setters, operators, and tenders",27,Na,22,Na,5,Na,
520 | Tire builders,8,Na,8,Na,0,Na,
521 | Helpers--production workers,24,Na,18,Na,7,Na,
522 | "Production workers, all other",846,625,643,666,203,501,
523 | TRANSPORTATION,6953,646,5998,679,955,494,
524 | Supervisors of transportation and material moving workers,186,894,153,898,33,Na,
525 | Aircraft pilots and flight engineers,114,1735,104,1830,9,Na,pilot
526 | Air traffic controllers and airfield operations specialists,32,Na,24,Na,8,Na,
527 | Flight attendants,63,846,20,Na,43,Na,
528 | "Ambulance drivers and attendants, except emergency medical technicians",18,Na,14,Na,4,Na,
529 | Bus drivers,323,615,184,681,138,572,
530 | Driver/sales workers and truck drivers,2687,747,2582,751,105,632,driver
531 | Taxi drivers and chauffeurs,253,585,216,600,38,Na,
532 | "Motor vehicle operators, all other",21,Na,18,Na,3,Na,
533 | Locomotive engineers and operators,44,Na,42,Na,2,Na,
534 | "Railroad brake, signal, and switch operators",5,Na,5,Na,0,Na,
535 | Railroad conductors and yardmasters,55,1117,52,1137,4,Na,yardmaster
536 | "Subway, streetcar, and other rail transportation workers",15,Na,12,Na,3,Na,
537 | Sailors and marine oilers,10,Na,9,Na,0,Na,sailor
538 | Ship and boat captains and operators,29,Na,28,Na,1,Na,captain
539 | Ship engineers,5,Na,4,Na,1,Na,
540 | Bridge and lock tenders,4,Na,4,Na,0,Na,
541 | Parking lot attendants,57,492,49,Na,8,Na,
542 | Automotive and watercraft service attendants,63,452,58,470,5,Na,
543 | Transportation inspectors,21,Na,14,Na,7,Na,
544 | "Transportation attendants, except flight attendants",17,Na,9,Na,8,Na,
545 | Other transportation workers,39,Na,35,Na,4,Na,
546 | Conveyor operators and tenders,7,Na,7,Na,0,Na,
547 | Crane and tower operators,75,988,71,1016,4,Na,
548 | "Dredge, excavating, and loading machine operators",25,Na,25,Na,0,Na,
549 | Hoist and winch operators,5,Na,5,Na,0,Na,
550 | Industrial truck and tractor operators,579,609,541,612,37,Na,
551 | Cleaners of vehicles and equipment,222,485,200,498,22,Na,
552 | "Laborers and freight, stock, and material movers, hand",1433,526,1214,547,219,455,
553 | Machine feeders and offbearers,30,Na,21,Na,9,Na,
554 | "Packers and packagers, hand",385,438,158,462,227,424,
555 | Pumping station operators,18,Na,17,Na,1,Na,
556 | Refuse and recyclable material collectors,72,501,66,496,6,Na,
557 | Mine shuttle car operators,0,Na,0,Na,0,Na,
558 | "Tank car, truck, and ship loaders",6,Na,6,Na,0,Na,
559 | "Material moving workers, all other",37,Na,32,Na,5,Na,
560 | OTHER SOURCES OCCUPATION DATA,Na,Na,Na,Na,Na,Na,
561 | Presidents in the U.S,,,45,,0,,president
562 | "https://datacenter.kidscount.org/data/tables/102-child-population-by-gender#detailed/1/any/false/1729,37,871,870,573,869,36,868,867,133/14,15,65/421,422",,,51,,49,,child
563 | https://data.worldbank.org/indicator/SP.POP.TOTL.FE.ZS?locations=US,,,99,,101,,person
564 | immeasurable neutral,,,50,,50,,"friend, user, patient"
565 | https://nces.ed.gov/fastfacts/display.asp?id=98,,,43,,57,,student
566 | https://www.insidehighered.com/news/2016/08/22/study-finds-gains-faculty-diversity-not-tenure-track,,,69,,31,,professor
567 | https://cawp.rutgers.edu/women-elective-office-2021,,,70,,30,,politician
568 | 


--------------------------------------------------------------------------------
/visualizations/delta_s_by_dist.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SLAB-NLP/BUG/6b5314d193ecd04a6864ffbfe329b42cf2aa622e/visualizations/delta_s_by_dist.png


--------------------------------------------------------------------------------