├── LICENSE ├── README.md ├── code ├── cluster │ ├── __init__.py │ ├── cluster.py │ └── train_clusterer.py ├── filter │ ├── evaluate_ft_models.py │ ├── lr │ │ ├── hyperparameters.py │ │ ├── lr_quality_filters.py │ │ ├── train.py │ │ └── util.py │ ├── quality_data_org.py │ ├── rule_based_scores.py │ ├── sample_openwebtext2.py │ ├── score_manager.py │ ├── text_normalizer.py │ ├── wikipedia_perplexity.py │ └── zreader.py ├── get_data │ ├── bloomfilter.py │ ├── dataset_statistics.py │ ├── get_random_pages.py │ ├── url_processor.py │ └── website_expander.py └── identity_measures │ ├── geography │ ├── bayesequal.py │ ├── evaluate_geoparse.py │ ├── geo_why.py │ ├── locations.py │ ├── locations_helper.py │ └── sample_for_annotate.py │ ├── person_vs_orgs.py │ ├── personas │ ├── annotation_eval.py │ ├── apply_role_extractor.py │ ├── get_people_lists.py │ ├── get_role_occurrences.py │ ├── get_tapt_data.py │ ├── occupation_specific_language.py │ ├── scrape_onet.py │ └── spacy_pos.py │ ├── roberta_classifier │ ├── classification_results.py │ ├── roberta_token_classify.py │ └── run_mlm.py │ └── spacy_helper.py ├── data └── filter_data │ └── combined │ ├── OpenWebText2 │ ├── openwebtext2_clf.pkl │ └── openwebtext2_vectorizer.pkl │ ├── WikiRef │ ├── wikiref_clf.pkl │ └── wikiref_vectorizer.pkl │ ├── WikiWebBooks │ ├── wikiwebbooks_clf.pkl │ └── wikiwebbooks_vectorizer.pkl │ └── Wikipedia │ ├── wikipedia_clf.pkl │ └── wikipedia_vectorizer.pkl └── environment_for_filters_only.yml /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/README.md -------------------------------------------------------------------------------- /code/cluster/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /code/cluster/cluster.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/code/cluster/cluster.py -------------------------------------------------------------------------------- /code/cluster/train_clusterer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/code/cluster/train_clusterer.py -------------------------------------------------------------------------------- /code/filter/evaluate_ft_models.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/code/filter/evaluate_ft_models.py -------------------------------------------------------------------------------- /code/filter/lr/hyperparameters.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/code/filter/lr/hyperparameters.py -------------------------------------------------------------------------------- /code/filter/lr/lr_quality_filters.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/code/filter/lr/lr_quality_filters.py -------------------------------------------------------------------------------- /code/filter/lr/train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/code/filter/lr/train.py -------------------------------------------------------------------------------- /code/filter/lr/util.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/code/filter/lr/util.py -------------------------------------------------------------------------------- /code/filter/quality_data_org.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/code/filter/quality_data_org.py -------------------------------------------------------------------------------- /code/filter/rule_based_scores.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/code/filter/rule_based_scores.py -------------------------------------------------------------------------------- /code/filter/sample_openwebtext2.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/code/filter/sample_openwebtext2.py -------------------------------------------------------------------------------- /code/filter/score_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/code/filter/score_manager.py -------------------------------------------------------------------------------- /code/filter/text_normalizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/code/filter/text_normalizer.py -------------------------------------------------------------------------------- /code/filter/wikipedia_perplexity.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/code/filter/wikipedia_perplexity.py -------------------------------------------------------------------------------- /code/filter/zreader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/code/filter/zreader.py -------------------------------------------------------------------------------- /code/get_data/bloomfilter.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/code/get_data/bloomfilter.py -------------------------------------------------------------------------------- /code/get_data/dataset_statistics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/code/get_data/dataset_statistics.py -------------------------------------------------------------------------------- /code/get_data/get_random_pages.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/code/get_data/get_random_pages.py -------------------------------------------------------------------------------- /code/get_data/url_processor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/code/get_data/url_processor.py -------------------------------------------------------------------------------- /code/get_data/website_expander.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/code/get_data/website_expander.py -------------------------------------------------------------------------------- /code/identity_measures/geography/bayesequal.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/code/identity_measures/geography/bayesequal.py -------------------------------------------------------------------------------- /code/identity_measures/geography/evaluate_geoparse.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/code/identity_measures/geography/evaluate_geoparse.py -------------------------------------------------------------------------------- /code/identity_measures/geography/geo_why.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/code/identity_measures/geography/geo_why.py -------------------------------------------------------------------------------- /code/identity_measures/geography/locations.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/code/identity_measures/geography/locations.py -------------------------------------------------------------------------------- /code/identity_measures/geography/locations_helper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/code/identity_measures/geography/locations_helper.py -------------------------------------------------------------------------------- /code/identity_measures/geography/sample_for_annotate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/code/identity_measures/geography/sample_for_annotate.py -------------------------------------------------------------------------------- /code/identity_measures/person_vs_orgs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/code/identity_measures/person_vs_orgs.py -------------------------------------------------------------------------------- /code/identity_measures/personas/annotation_eval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/code/identity_measures/personas/annotation_eval.py -------------------------------------------------------------------------------- /code/identity_measures/personas/apply_role_extractor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/code/identity_measures/personas/apply_role_extractor.py -------------------------------------------------------------------------------- /code/identity_measures/personas/get_people_lists.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/code/identity_measures/personas/get_people_lists.py -------------------------------------------------------------------------------- /code/identity_measures/personas/get_role_occurrences.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/code/identity_measures/personas/get_role_occurrences.py -------------------------------------------------------------------------------- /code/identity_measures/personas/get_tapt_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/code/identity_measures/personas/get_tapt_data.py -------------------------------------------------------------------------------- /code/identity_measures/personas/occupation_specific_language.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/code/identity_measures/personas/occupation_specific_language.py -------------------------------------------------------------------------------- /code/identity_measures/personas/scrape_onet.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/code/identity_measures/personas/scrape_onet.py -------------------------------------------------------------------------------- /code/identity_measures/personas/spacy_pos.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/code/identity_measures/personas/spacy_pos.py -------------------------------------------------------------------------------- /code/identity_measures/roberta_classifier/classification_results.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/code/identity_measures/roberta_classifier/classification_results.py -------------------------------------------------------------------------------- /code/identity_measures/roberta_classifier/roberta_token_classify.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/code/identity_measures/roberta_classifier/roberta_token_classify.py -------------------------------------------------------------------------------- /code/identity_measures/roberta_classifier/run_mlm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/code/identity_measures/roberta_classifier/run_mlm.py -------------------------------------------------------------------------------- /code/identity_measures/spacy_helper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/code/identity_measures/spacy_helper.py -------------------------------------------------------------------------------- /data/filter_data/combined/OpenWebText2/openwebtext2_clf.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/data/filter_data/combined/OpenWebText2/openwebtext2_clf.pkl -------------------------------------------------------------------------------- /data/filter_data/combined/OpenWebText2/openwebtext2_vectorizer.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/data/filter_data/combined/OpenWebText2/openwebtext2_vectorizer.pkl -------------------------------------------------------------------------------- /data/filter_data/combined/WikiRef/wikiref_clf.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/data/filter_data/combined/WikiRef/wikiref_clf.pkl -------------------------------------------------------------------------------- /data/filter_data/combined/WikiRef/wikiref_vectorizer.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/data/filter_data/combined/WikiRef/wikiref_vectorizer.pkl -------------------------------------------------------------------------------- /data/filter_data/combined/WikiWebBooks/wikiwebbooks_clf.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/data/filter_data/combined/WikiWebBooks/wikiwebbooks_clf.pkl -------------------------------------------------------------------------------- /data/filter_data/combined/WikiWebBooks/wikiwebbooks_vectorizer.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/data/filter_data/combined/WikiWebBooks/wikiwebbooks_vectorizer.pkl -------------------------------------------------------------------------------- /data/filter_data/combined/Wikipedia/wikipedia_clf.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/data/filter_data/combined/Wikipedia/wikipedia_clf.pkl -------------------------------------------------------------------------------- /data/filter_data/combined/Wikipedia/wikipedia_vectorizer.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/data/filter_data/combined/Wikipedia/wikipedia_vectorizer.pkl -------------------------------------------------------------------------------- /environment_for_filters_only.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lucy3/whos_filtered/HEAD/environment_for_filters_only.yml --------------------------------------------------------------------------------