├── .gitignore ├── NGEC ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-310.pyc │ ├── __init__.cpython-37.pyc │ ├── __init__.cpython-38.pyc │ ├── __init__.cpython-39.pyc │ ├── actor_resolution.cpython-310.pyc │ ├── actor_resolution.cpython-37.pyc │ ├── actor_resolution.cpython-38.pyc │ ├── actor_resolution.cpython-39.pyc │ ├── attribute_model.cpython-310.pyc │ ├── attribute_model.cpython-37.pyc │ ├── attribute_model.cpython-39.pyc │ ├── formatter.cpython-310.pyc │ ├── formatter.cpython-37.pyc │ ├── formatter.cpython-39.pyc │ ├── geolocation.cpython-310.pyc │ ├── geolocation.cpython-37.pyc │ ├── geolocation.cpython-39.pyc │ ├── utilities.cpython-310.pyc │ └── utilities.cpython-39.pyc ├── actor_resolution.py ├── assets │ ├── PLOVER_agents.hash │ ├── PLOVER_agents.txt │ ├── README.md │ ├── actor_sim_model2 │ │ ├── 1_Pooling │ │ │ └── config.json │ │ ├── README.md │ │ ├── config.json │ │ ├── config_sentence_transformers.json │ │ ├── eval │ │ │ └── similarity_evaluation_sts-dev_results.csv │ │ ├── modules.json │ │ ├── pytorch_model.bin │ │ ├── sentence_bert_config.json │ │ ├── special_tokens_map.json │ │ ├── tokenizer.json │ │ ├── tokenizer_config.json │ │ └── vocab.txt │ ├── bert_matrix.pkl │ ├── countries.csv │ ├── countries.numbers │ ├── cow2iso.txt │ ├── event_mode_questions.csv │ ├── event_models │ │ ├── ACCUSE.skops │ │ ├── AGREE.skops │ │ ├── AID.skops │ │ ├── ASSAULT.skops │ │ ├── COERCE.skops │ │ ├── CONCEDE.skops │ │ ├── CONSULT.skops │ │ ├── COOPERATE.skops │ │ ├── MOBILIZE.skops │ │ ├── PROTEST.skops │ │ ├── REJECT.skops │ │ ├── REQUEST.skops │ │ ├── RETREAT.skops │ │ ├── SANCTION.skops │ │ ├── SUPPORT.skops │ │ └── THREATEN.skops │ ├── option_model.pt │ ├── pattern_matrix.npy │ └── pattern_matrix.pkl ├── attribute_model.py ├── context_class.py ├── event_class.py ├── formatter.py ├── geolocation.py ├── mode_class.py ├── tests │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ ├── conftest.cpython-39-pytest-7.0.1.pyc │ │ └── test_actor_resolution.cpython-39-pytest-7.0.1.pyc │ ├── conftest.py │ ├── test_actor_resolution.py │ ├── test_attribute_model.py │ ├── test_formatter.py │ └── test_multiple_actors.py └── utilities.py ├── README.md ├── examples ├── Guardian_SDF_sample.csv.zip ├── NGEC_pres.pdf ├── README.md ├── demo_mordecai.py └── demo_wiki_resolution.py ├── ngec_process.py ├── ngec_streamlit.py ├── requirements.txt ├── setup.py └── setup ├── README.md ├── train_classifiers ├── README.md ├── fit_event_classifier.py ├── generate_synthetic_news.py ├── gpt_synthetic_events_2023-10-19_19.csv.zip └── synthetic_headlines.csv └── wiki ├── README.md ├── actor_contrastive_data.py ├── create_index.sh ├── load_wiki_es.py ├── load_wiki_scratch.py ├── requirements.txt └── wiki_mapping.json /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | __pycache__ 3 | NGEC/__pycache__* 4 | *.pyc 5 | *.hash 6 | NGEC/assets/bert_matrix.pkl 7 | -------------------------------------------------------------------------------- /NGEC/__init__.py: -------------------------------------------------------------------------------- 1 | from .event_class import EventClass 2 | from .actor_resolution import ActorResolver 3 | from .geolocation import GeolocationModel 4 | from .attribute_model import AttributeModel 5 | from .formatter import Formatter 6 | -------------------------------------------------------------------------------- /NGEC/__pycache__/__init__.cpython-310.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /NGEC/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /NGEC/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /NGEC/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /NGEC/__pycache__/actor_resolution.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/__pycache__/actor_resolution.cpython-310.pyc -------------------------------------------------------------------------------- /NGEC/__pycache__/actor_resolution.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/__pycache__/actor_resolution.cpython-37.pyc -------------------------------------------------------------------------------- /NGEC/__pycache__/actor_resolution.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/__pycache__/actor_resolution.cpython-38.pyc -------------------------------------------------------------------------------- /NGEC/__pycache__/actor_resolution.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/__pycache__/actor_resolution.cpython-39.pyc -------------------------------------------------------------------------------- /NGEC/__pycache__/attribute_model.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/__pycache__/attribute_model.cpython-310.pyc -------------------------------------------------------------------------------- /NGEC/__pycache__/attribute_model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/__pycache__/attribute_model.cpython-37.pyc -------------------------------------------------------------------------------- /NGEC/__pycache__/attribute_model.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/__pycache__/attribute_model.cpython-39.pyc -------------------------------------------------------------------------------- /NGEC/__pycache__/formatter.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/__pycache__/formatter.cpython-310.pyc -------------------------------------------------------------------------------- /NGEC/__pycache__/formatter.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/__pycache__/formatter.cpython-37.pyc -------------------------------------------------------------------------------- /NGEC/__pycache__/formatter.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/__pycache__/formatter.cpython-39.pyc -------------------------------------------------------------------------------- /NGEC/__pycache__/geolocation.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/__pycache__/geolocation.cpython-310.pyc -------------------------------------------------------------------------------- /NGEC/__pycache__/geolocation.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/__pycache__/geolocation.cpython-37.pyc -------------------------------------------------------------------------------- /NGEC/__pycache__/geolocation.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/__pycache__/geolocation.cpython-39.pyc -------------------------------------------------------------------------------- /NGEC/__pycache__/utilities.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/__pycache__/utilities.cpython-310.pyc -------------------------------------------------------------------------------- /NGEC/__pycache__/utilities.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/__pycache__/utilities.cpython-39.pyc -------------------------------------------------------------------------------- /NGEC/assets/PLOVER_agents.hash: -------------------------------------------------------------------------------- 1 | 1359402526471412912 -------------------------------------------------------------------------------- /NGEC/assets/README.md: -------------------------------------------------------------------------------- 1 | - ISO --> COW conversion document. 
Credit: https://github.com/leops95/cow2iso -------------------------------------------------------------------------------- /NGEC/assets/actor_sim_model2/1_Pooling/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "word_embedding_dimension": 384, 3 | "pooling_mode_cls_token": false, 4 | "pooling_mode_mean_tokens": true, 5 | "pooling_mode_max_tokens": false, 6 | "pooling_mode_mean_sqrt_len_tokens": false 7 | } -------------------------------------------------------------------------------- /NGEC/assets/actor_sim_model2/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | pipeline_tag: sentence-similarity 3 | tags: 4 | - sentence-transformers 5 | - feature-extraction 6 | - sentence-similarity 7 | - transformers 8 | 9 | --- 10 | 11 | # {MODEL_NAME} 12 | 13 | This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search. 14 | 15 | 16 | 17 | ## Usage (Sentence-Transformers) 18 | 19 | Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed: 20 | 21 | ``` 22 | pip install -U sentence-transformers 23 | ``` 24 | 25 | Then you can use the model like this: 26 | 27 | ```python 28 | from sentence_transformers import SentenceTransformer 29 | sentences = ["This is an example sentence", "Each sentence is converted"] 30 | 31 | model = SentenceTransformer('{MODEL_NAME}') 32 | embeddings = model.encode(sentences) 33 | print(embeddings) 34 | ``` 35 | 36 | 37 | 38 | ## Usage (HuggingFace Transformers) 39 | Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: First, you pass your input through the transformer model, then you have to apply the right pooling-operation on-top of the contextualized word embeddings. 40 | 41 | ```python 42 | from transformers import AutoTokenizer, AutoModel 43 | import torch 44 | 45 | 46 | #Mean Pooling - Take attention mask into account for correct averaging 47 | def mean_pooling(model_output, attention_mask): 48 | token_embeddings = model_output[0] #First element of model_output contains all token embeddings 49 | input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() 50 | return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9) 51 | 52 | 53 | # Sentences we want sentence embeddings for 54 | sentences = ['This is an example sentence', 'Each sentence is converted'] 55 | 56 | # Load model from HuggingFace Hub 57 | tokenizer = AutoTokenizer.from_pretrained('{MODEL_NAME}') 58 | model = AutoModel.from_pretrained('{MODEL_NAME}') 59 | 60 | # Tokenize sentences 61 | encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt') 62 | 63 | # Compute token embeddings 64 | with torch.no_grad(): 65 | model_output = model(**encoded_input) 66 | 67 | # Perform pooling. In this case, mean pooling. 
68 | sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask']) 69 | 70 | print("Sentence embeddings:") 71 | print(sentence_embeddings) 72 | ``` 73 | 74 | 75 | 76 | ## Evaluation Results 77 | 78 | 79 | 80 | For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name={MODEL_NAME}) 81 | 82 | 83 | ## Training 84 | The model was trained with the parameters: 85 | 86 | **DataLoader**: 87 | 88 | `torch.utils.data.dataloader.DataLoader` of length 30311 with parameters: 89 | ``` 90 | {'batch_size': 128, 'sampler': 'torch.utils.data.sampler.RandomSampler', 'batch_sampler': 'torch.utils.data.sampler.BatchSampler'} 91 | ``` 92 | 93 | **Loss**: 94 | 95 | `sentence_transformers.losses.ContrastiveLoss.ContrastiveLoss` with parameters: 96 | ``` 97 | {'distance_metric': 'SiameseDistanceMetric.COSINE_DISTANCE', 'margin': 0.5, 'size_average': True} 98 | ``` 99 | 100 | Parameters of the fit()-Method: 101 | ``` 102 | { 103 | "epochs": 3, 104 | "evaluation_steps": 40000, 105 | "evaluator": "sentence_transformers.evaluation.EmbeddingSimilarityEvaluator.EmbeddingSimilarityEvaluator", 106 | "max_grad_norm": 1, 107 | "optimizer_class": "", 108 | "optimizer_params": { 109 | "lr": 0.0001 110 | }, 111 | "scheduler": "WarmupLinear", 112 | "steps_per_epoch": null, 113 | "warmup_steps": 9094, 114 | "weight_decay": 0.01 115 | } 116 | ``` 117 | 118 | 119 | ## Full Model Architecture 120 | ``` 121 | SentenceTransformer( 122 | (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 123 | (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False}) 124 | ) 125 | ``` 126 | 127 | ## Citing & Authors 128 | 129 | -------------------------------------------------------------------------------- /NGEC/assets/actor_sim_model2/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name_or_path": "/home/gridsan/ahalt/huggingface_models/paraphrase-MiniLM-L3-v2/", 3 | "architectures": [ 4 | "BertModel" 5 | ], 6 | "attention_probs_dropout_prob": 0.1, 7 | "classifier_dropout": null, 8 | "gradient_checkpointing": false, 9 | "hidden_act": "gelu", 10 | "hidden_dropout_prob": 0.1, 11 | "hidden_size": 384, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 1536, 14 | "layer_norm_eps": 1e-12, 15 | "max_position_embeddings": 512, 16 | "model_type": "bert", 17 | "num_attention_heads": 12, 18 | "num_hidden_layers": 3, 19 | "pad_token_id": 0, 20 | "position_embedding_type": "absolute", 21 | "torch_dtype": "float32", 22 | "transformers_version": "4.24.0", 23 | "type_vocab_size": 2, 24 | "use_cache": true, 25 | "vocab_size": 30522 26 | } 27 | -------------------------------------------------------------------------------- /NGEC/assets/actor_sim_model2/config_sentence_transformers.json: -------------------------------------------------------------------------------- 1 | { 2 | "__version__": { 3 | "sentence_transformers": "2.0.0", 4 | "transformers": "4.7.0", 5 | "pytorch": "1.9.0+cu102" 6 | } 7 | } -------------------------------------------------------------------------------- /NGEC/assets/actor_sim_model2/eval/similarity_evaluation_sts-dev_results.csv: -------------------------------------------------------------------------------- 1 | 
epoch,steps,cosine_pearson,cosine_spearman,euclidean_pearson,euclidean_spearman,manhattan_pearson,manhattan_spearman,dot_pearson,dot_spearman 2 | 0,-1,0.6109547883709737,0.5786435852182326,0.5838057989492854,0.5366218195030185,0.5832466180054964,0.536238540813903,0.5818760709295144,0.5343506422729409 3 | 1,-1,0.6801764356662623,0.6354600600378237,0.665073766144093,0.6068232368839985,0.664451226644231,0.6063922669643912,0.6804853813087184,0.6128786592142725 4 | 2,-1,0.7036958969271553,0.6537842300050594,0.7021022027187646,0.6338679348386188,0.7013755290609477,0.6335184031333557,0.7054851039552567,0.6324461529849091 5 | -------------------------------------------------------------------------------- /NGEC/assets/actor_sim_model2/modules.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "idx": 0, 4 | "name": "0", 5 | "path": "", 6 | "type": "sentence_transformers.models.Transformer" 7 | }, 8 | { 9 | "idx": 1, 10 | "name": "1", 11 | "path": "1_Pooling", 12 | "type": "sentence_transformers.models.Pooling" 13 | } 14 | ] -------------------------------------------------------------------------------- /NGEC/assets/actor_sim_model2/pytorch_model.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/actor_sim_model2/pytorch_model.bin -------------------------------------------------------------------------------- /NGEC/assets/actor_sim_model2/sentence_bert_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "max_seq_length": 128, 3 | "do_lower_case": false 4 | } -------------------------------------------------------------------------------- /NGEC/assets/actor_sim_model2/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | { 2 | "cls_token": "[CLS]", 3 | "mask_token": "[MASK]", 4 | "pad_token": "[PAD]", 5 | "sep_token": "[SEP]", 6 | "unk_token": "[UNK]" 7 | } 8 | -------------------------------------------------------------------------------- /NGEC/assets/actor_sim_model2/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "cls_token": "[CLS]", 3 | "do_basic_tokenize": true, 4 | "do_lower_case": true, 5 | "mask_token": "[MASK]", 6 | "model_max_length": 512, 7 | "name_or_path": "/home/gridsan/ahalt/huggingface_models/paraphrase-MiniLM-L3-v2/", 8 | "never_split": null, 9 | "pad_token": "[PAD]", 10 | "sep_token": "[SEP]", 11 | "special_tokens_map_file": "/home/gridsan/ahalt/huggingface_models/paraphrase-MiniLM-L3-v2/special_tokens_map.json", 12 | "strip_accents": null, 13 | "tokenize_chinese_chars": true, 14 | "tokenizer_class": "BertTokenizer", 15 | "unk_token": "[UNK]" 16 | } 17 | -------------------------------------------------------------------------------- /NGEC/assets/bert_matrix.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/bert_matrix.pkl -------------------------------------------------------------------------------- /NGEC/assets/countries.csv: -------------------------------------------------------------------------------- 1 | CCA2,Name,CCA3,Nationality 2 | AF,Afghanistan,AFG,"Afghan, Afghani, Afghans, Islamic Republic of Afghanistan" 3 | AL,Albania,ALB,"Albanian, Albanians" 4 | 
DZ,Algeria,DZA,"Algerian, Algerians" 5 | AD,Andorra,AND,"Andorran, Andorrans" 6 | AO,Angola,AGO,"Angolan, Angolans" 7 | AG,Antigua and Barbuda,ATG,"Antiguan, Barbudan, Antiguans, Barbudans" 8 | AR,Argentina,ARG,"Argentinean, Argentine, Argentineans, Argentines" 9 | AM,Armenia,ARM,"Armenian, Armenians" 10 | AU,Australia,AUS,"Australian, Australians,Christmas Island,Christmas Islander, Christmas Islanders,Keeling Islands, Cocos [Keeling] Islands,Cocos Islander,Heard Island and McDonald Island,Heard and McDonald Islander,Norfolk Island,Norfolk Islander" 11 | AT,Austria,AUT,"Austrian, Austrians" 12 | AZ,Azerbaijan,AZE,"Azerbaijani, Azeri, Azerbaijanis, Azeris" 13 | BS,Bahamas,BHS,"Bahamian, Bahamian" 14 | BH,Bahrain,BHR,"Bahraini, Bahrainis" 15 | BD,Bangladesh,BGD,"Bangladeshi, Bangladeshis" 16 | BB,Barbados,BRB,"Barbadian, Barbadians" 17 | BY,Belarus,BLR,"Belarusian, Belarusians" 18 | BE,Belgium,BEL,"Belgian, Belgians" 19 | BZ,Belize,BLZ,"Belizean, Belizeans" 20 | BJ,Benin,BEN,Beninese 21 | BT,Bhutan,BTN,Bhutanese 22 | BO,Bolivia,BOL,"Bolivian, Bolivians" 23 | BA,Bosnia and Herzegovina,BIH,"Bosnian, Bosniak,Herzegovinian,Bosnians,Bosniaks,Herzegovinians" 24 | BW,Botswana,BWA,"Motswana, Botswanan" 25 | BR,Brazil,BRA,"Brazilian, Brazilians" 26 | BN,Brunei,BRN,"Bruneian, Bruneians" 27 | BG,Bulgaria,BGR,"Bulgarian, Bulgarians" 28 | BF,Burkina Faso,BFA,Burkinabe 29 | BI,Burundi,BDI,"Burundian, Burundians" 30 | KH,Cambodia,KHM,"Cambodian, Cambodians" 31 | CM,Cameroon,CMR,"Cameroonian, Cameroonians" 32 | CA,Canada,CAN,"Canadian, Canadians" 33 | CV,Cabo Verde,CPV,"Cape Verdian,Cabo Verde,Cape Verde" 34 | CF,Central African Republic,CAF,"Central African, Central Africans" 35 | TD,Chad,TCD,"Chadian,Chadians" 36 | CL,Chile,CHL,"Chilean, Chileans" 37 | CN,China,CHN,"Chinese, People's Republic of China" 38 | CO,Colombia,COL,"Colombian, Colombians" 39 | KM,Comoros,COM,"Comoran, Comorans" 40 | CG,Congo,COG,"Republic of the Congo,Congolese,Congo-Brazzaville" 41 | CR,Costa Rica,CRI,"Costa Rican, Costa Ricans" 42 | CI,Cote d'Ivoire,CIV,"Ivorian, Ivorians, Cote d’Ivoire, Ivory Coast, Côte d’Ivoire" 43 | HR,Croatia,HRV,"Croatian, Croat, Croatians, Croats" 44 | CU,Cuba,CUB,"Cuban, Cubans" 45 | CY,Cyprus,CYP,"Cypriot, Cypriots" 46 | CZ,Czechia,CZE,"Czech, Czechs,Czech Republic" 47 | CD,Democratic Republic of the Congo,COD,"DRC,DR Congo,Zaire" 48 | DK,Denmark,DNK,"Danish, Dane, Danes,Faroe Islands,Faroese,Faroe Islander,Greenland,Greenlandic, Greenlander, Greenlanders" 49 | DJ,Djibouti,DJI,"Djiboutian, Djiboutians" 50 | DO,Dominican Republic,DOM,"Dominican, Dominicans, Dominicanos" 51 | EC,Ecuador,ECU,"Ecuadorean, Ecuadoreans" 52 | EG,Egypt,EGY,"Egyptian, Egyptians" 53 | SV,El Salvador,SLV,"Salvadoran,Salvadorans" 54 | GQ,Equatorial Guinea,GNQ,"Equatorial Guinean, Equatorial Guineans" 55 | ER,Eritrea,ERI,"Eritrean, Eritreans" 56 | EE,Estonia,EST,"Estonian, Estonians" 57 | ET,Ethiopia,ETH,"Ethiopian, Ethiopians" 58 | NA,European Union,EUR,"European Union, EU" 59 | FJ,Fiji,FJI,"Fijian, Fijians" 60 | FI,Finland,FIN,"Finnish, Finn, Finns,Åland Islands,Åland Islander" 61 | FR,France,FRA,"French,French Guiana,French Polynesia,French Polynesian,French Polynesians,French Southern Territories,Guadeloupe,Guadeloupian, Guadeloupians,Martinique,Mayotte,New Caledonia,New Caledonian,New Caledonians,Réunion,Saint Barthélemy,Saint Martin,Saint Martin Islander,Saint Martin Islanders,Saint Pierre and Miquelon,Wallis and Futuna,Wallis and Futuna Islander" 62 | GA,Gabon,GAB,Gabonese 63 | GM,Gambia,GMB,"Gambian, Gambians" 64 | 
GE,Georgia,GEO,"Georgian, Georgians" 65 | DE,Germany,DEU,"German, Germans" 66 | GH,Ghana,GHA,"Ghanaian, Ghanaians" 67 | GR,Greece,GRC,"Greek, Greeks" 68 | GD,Grenada,GRD,Grenadian 69 | GT,Guatemala,GTM,"Guatemalan, Guatemalans" 70 | GN,Guinea,GIN,"Guinean, Guineans" 71 | GW,Guinea-Bissau,GNB,"Guinea-Bissauan, Guinea-Bissauans" 72 | GY,Guyana,GUY,Guyanese 73 | HT,Haiti,HTI,"Haitian, Haitians" 74 | HN,Honduras,HND,"Honduran, Hondurans" 75 | HK,Hong Kong SAR China,HKG,"Hong Kong, Hong Konger, Hong Kongers,Hong Kongese" 76 | HU,Hungary,HUN,"Hungarian, Hungarians" 77 | IS,Iceland,ISL,"Icelander, Icelanders" 78 | IN,India,IND,"Indian, Indians" 79 | ID,Indonesia,IDN,"Indonesian, Indonesians" 80 | IR,Iran,IRN,"Iranian, Iranians,Islamic Republic of Iran" 81 | IQ,Iraq,IRQ,"Iraqi, Iraqis" 82 | IE,Ireland,IRL,Irish 83 | IL,Israel,ISR,"Israeli, Israelis" 84 | IT,Italy,ITA,"Italian, Italians" 85 | JM,Jamaica,JAM,"Jamaican, Jamaicans" 86 | JP,Japan,JPN,Japanese 87 | JO,Jordan,JOR,"Jordanian, Jordanians" 88 | KZ,Kazakhstan,KAZ,"Kazakhstani, Kazakh, Kazakhs" 89 | KE,Kenya,KEN,"Kenyan, Kenyan" 90 | KI,Kiribati,KIR,I-Kiribati 91 | XK,Kosovo,XKX,"Kosovar, Kosovars, Kosovans, Kosovan" 92 | KW,Kuwait,KWT,"Kuwaiti, Kuwaitis" 93 | KG,Kyrgyzstan,KGZ,"Kirghiz, Kyrgyz" 94 | LA,Laos,LAO,"Laotian,Laotians,Lao People's Democratic Republic" 95 | LV,Latvia,LVA,"Latvian,Latvians" 96 | LB,Lebanon,LBN,Lebanese 97 | LS,Lesotho,LSO,"Mosotho,Mosothos" 98 | LR,Liberia,LBR,"Liberian,Liberians" 99 | LY,Libya,LBY,"Libyan,Libyans" 100 | LI,Liechtenstein,LIE,"Liechtensteiner, Liechtensteiners" 101 | LT,Lithuania,LTU,"Lithuanian,Lithuanians" 102 | LU,Luxembourg,LUX,"Luxembourger,Luxembourgers" 103 | MK,North Macedonia,MKD,"Macedonia,Macedonian,Macedonians,FYROM" 104 | MG,Madagascar,MDG,Malagasy 105 | MW,Malawi,MWI,"Malawian,Malawians,Malawis" 106 | MY,Malaysia,MYS,"Malaysian,Malaysians" 107 | MV,Maldives,MDV,"Maldivan,Maldivans" 108 | ML,Mali,MLI,"Malian,Malians" 109 | MT,Malta,MLT,Maltese 110 | MH,Marshall Islands,MHL,Marshallese 111 | MR,Mauritania,MRT,"Mauritanian,Mauritanians" 112 | MU,Mauritius,MUS,Mauritian 113 | MX,Mexico,MEX,"Mexican,Mexicans" 114 | FM,Micronesia,FSM,"Micronesian, Micronesians" 115 | MD,Moldova,MDA,"Moldovan,Moldovans" 116 | MC,Monaco,MCO,"Monegasque,Monegasques" 117 | MN,Mongolia,MNG,"Mongolian,Mongolians" 118 | ME,Montenegro,MNE,"Montenegrin,Montenegrins" 119 | MA,Morocco,MAR,"Moroccan,Moroccans,Western Sahara,Sahrawi" 120 | MZ,Mozambique,MOZ,"Mozambican,Mozambicans" 121 | MM,Myanmar [Burma],MMR,"Myanmar, Burmese" 122 | NA,Namibia,NAM,"Namibian,Namibians" 123 | NR,Nauru,NRU,Nauruan 124 | NP,Nepal,NPL,Nepalese 125 | NL,Netherlands,NLD,"Dutch,Aruba,Aruban,Arubans,Bonaire, Sint Eustatius and Saba, Curaçao, Curaçaoan,Curacao,Curacaoan,Sint Maarten" 126 | NZ,New Zealand,NZL,"New Zealander, Kiwi,New Zealanders, Kiwis,Niue,Niuean,Niuean,Tokelau,Tokelauan,Cook Islands, Cook Islander" 127 | NI,Nicaragua,NIC,"Nicaraguan,Nicaraguans" 128 | NE,Niger,NER,"Nigeran,Nigerans" 129 | NG,Nigeria,NGA,"Nigerian,Nigerians" 130 | KP,North Korea,PRK,"North Korean, North Koreans,DPRK" 131 | NO,Norway,NOR,"Norwegian,Norwegians,Bouvet Island,Bouvet Islanders,Svalbard and Jan Mayen,Svalbard" 132 | OM,Oman,OMN,"Omani,Omanis" 133 | PK,Pakistan,PAK,"Pakistani,Pakistanis" 134 | PW,Palau,PLW,"Palauan,Palauans" 135 | PS,Palestinian Territories,PSE,"Palestinian,Palestine,Palestinians" 136 | PA,Panama,PAN,"Panamanian,Panamanians" 137 | PG,Papua New Guinea,PNG,"Papua New Guinean,Papua New Guineans" 138 | 
PY,Paraguay,PRY,"Paraguayan,Paraguayans" 139 | PE,Peru,PER,"Peruvian,Peruvians" 140 | PH,Philippines,PHL,"Filipino,Filipinos" 141 | PL,Poland,POL,"Polish,Poles" 142 | PT,Portugal,PRT,Portuguese 143 | QA,Qatar,QAT,"Qatari,Qataris" 144 | RO,Romania,ROU,"Romanian,Romanians" 145 | RU,Russia,RUS,"Russian,Russians" 146 | RW,Rwanda,RWA,"Rwandan,Rwandans" 147 | KN,Saint Kitts and Nevis,KNA,Kittian and Nevisian 148 | LC,Saint Lucia,LCA,"Saint Lucian, Saint Lucians" 149 | VC,Saint Vincent and the Grenadines,VCT,Saint Vincentian 150 | WS,Samoa,WSM,"Samoan,Samoans" 151 | SM,San Marino,SMR,Sammarinese 152 | ST,Sao Tome and Principe,STP,"Sao Tomean,Sao Tomeans,São Tomé and Príncipe" 153 | SA,Saudi Arabia,SAU,"Saudi Arabia,Saudi Arabian,Saudi,Saudi Arabians,Saudis" 154 | SN,Senegal,SEN,Senegalese 155 | RS,Serbia,SRB,"Serbian,Serb,Serbs" 156 | SC,Seychelles,SYC,Seychellois 157 | SL,Sierra Leone,SLE,"Sierra Leonean,Sierra Leoneans" 158 | SG,Singapore,SGP,"Singaporean,Singaporeans" 159 | SK,Slovakia,SVK,"Slovak,Slovaks" 160 | SI,Slovenia,SVN,"Slovene,Slovenes" 161 | SB,Solomon Islands,SLB,"Solomon Islander,Solomon Islanders" 162 | SO,Somalia,SOM,"Somali,Somalis" 163 | ZA,South Africa,ZAF,"South African,South Africans" 164 | KR,South Korea,KOR,"South Korean, South Koreans" 165 | ES,Spain,ESP,"Spanish, Spaniard, Spaniards" 166 | LK,Sri Lanka,LKA,"Sri Lankan, Sri Lankans" 167 | SD,Sudan,SDN,Sudanese 168 | SR,Suriname,SUR,Surinamer 169 | SZ,Eswatini,SWZ,"Swazi,Swazis,Swaziland" 170 | SE,Sweden,SWE,"Swedish,Swede,Swedes" 171 | CH,Switzerland,CHE,Swiss 172 | SY,Syria,SYR,"Syrian,Syrians,Syrian Arab Republic" 173 | TW,Taiwan,TWN,"Taiwanese,Republic of China" 174 | TJ,Tajikistan,TJK,"Tadzhik,Tajik,Tajiks" 175 | TZ,Tanzania,TZA,"Tanzanian, Tanzanians, United Republic of Tanzania" 176 | TH,Thailand,THA,"Thai,Thais" 177 | TL,Timor-Leste,TLS,"East Timorese, East Timor" 178 | TG,Togo,TGO,Togolese 179 | TO,Tonga,TON,"Tongan,Tongan" 180 | TT,Trinidad and Tobago,TTO,"Trinidadian,Trinidadians" 181 | TN,Tunisia,TUN,"Tunisian,Tunisians" 182 | TR,Turkey,TUR,"Turkish,Turk,Turks" 183 | TM,Turkmenistan,TKM,"Turkmen,Turkmeni,Turkmens,Turkmenis" 184 | TV,Tuvalu,TUV,"Tuvaluan,Tuvaluans" 185 | UG,Uganda,UGA,"Ugandan, Ugandans" 186 | UA,Ukraine,UKR,"Ukrainian, Ukrainians" 187 | AE,United Arab Emirates,ARE,"Emirati, Emiratis" 188 | GB,United Kingdom,GBR,"British, English, England, UK, U.K., United Kingdom of Great Britain and Northern Ireland, Britain, Brits, Scottish, Welsh, Scotland, Wales, Great Britain, Anguilla,Anguillian, Anguillians,Bermuda,Bermudian, Bermudians,British Indian Ocean Territory,Cayman Islands,Cayman Islander, Caymanians,Caymanian,Falkland Islands,Falkland Islander,Falkland Islanders,Falklander,Falklanders,Falkland Islands (Malvinas),Islas Malvinas,Gibraltar,Gibraltar, Gibraltarian, Gibraltarians,Guernsey,Isle of Man,Manx,Channel Islander, Channel Islanders,Montserrat,Montserratian,Pitcairn Islands,Pitcairn Islander,Pitcairn Islanders,Saint Helena,Saint Helenian,Saint Helenians,South Georgia and the South Sandwich Islands,Turks and Caicos Islands,Turks and Caicos Islander,British Virgin Islands,British Virgin Island" 189 | NA,United Nations,UNO,"United Nations, UN, U.N." 190 | US,United States,USA,"American, US, U.S., United States,Americans,American Samoa,American Samoan,American Samoans,Guam,Guamanian, Guamanians,Northern Mariana Islands,Puerto Rico,Puerto Rican,Puerto Ricans,United States Minor Outlying Islands,Virgin Islands (U.S.), U.S. 
Virgin Islands,US Virgin Islands" 191 | UY,Uruguay,URY,"Uruguayan, Uruguayans" 192 | UZ,Uzbekistan,UZB,"Uzbekistani, Uzbekistanis" 193 | VU,Vanuatu,VUT,Ni-Vanuatu 194 | VA,Vatican City,VAT,Vatican City 195 | VE,Venezuela,VEN,"Venezuelan,Venezuelans" 196 | VN,Vietnam,VNM,Vietnamese 197 | YE,Yemen,YEM,"Yemeni,Yemenis" 198 | ZM,Zambia,ZMB,"Zambian,Zambians" 199 | ZW,Zimbabwe,ZWE,"Zimbabwean,Zimbabweans" 200 | ,South Ossetia,GEOPRE,South Ossetian 201 | ,Abkhazia,GEOPRE,"Republic of Abkhazia, Abkhaz" 202 | ,Transnistria,MDAPRE,Transnistrian 203 | ,Luhansk People's Republic,UKRPRE,Luhansk 204 | ,Donetsk People's Republic,UKRPRE,Donetsk 205 | ,Nagorno-Karabakh,AZEPRE,Republic of Artsakh 206 | ,Somaliland,SOMPRE,Somalilander 207 | ,Northern Cyprus,CYPPRE,North Cypriot -------------------------------------------------------------------------------- /NGEC/assets/countries.numbers: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/countries.numbers -------------------------------------------------------------------------------- /NGEC/assets/cow2iso.txt: -------------------------------------------------------------------------------- 1 | cow_id,cow3,iso_id,iso2,iso3,valid_from,valid_until,cname,cname_full,comments,statenme 2 | 2,USA,841,US,USA,1962,1980,USA (before 1981),USA and Puerto Rico,Including Puerto Rico,United States of America 3 | 2,USA,842,US,USA,1981,,USA,"USA, Puerto Rico and US Virgin Islands",Including Puerto Rico and US Virgin Islands,United States of America 4 | 20,CAN,124,CA,CAN,1962,,Canada,Canada,,Canada 5 | 31,BHM,44,BS,BHS,1962,,Bahamas,Bahamas,,Bahamas 6 | 40,CUB,192,CU,CUB,1962,,Cuba,Cuba,,Cuba 7 | 41,HAI,332,HT,HTI,1962,,Haiti,Haiti,,Haiti 8 | 42,DOM,214,DO,DOM,1962,,Dominican Rep.,Dominican Republic,,Dominican Republic 9 | 51,JAM,388,JM,JAM,1962,,Jamaica,Jamaica,,Jamaica 10 | 52,TRI,780,TT,TTO,1962,,Trinidad and Tobago,Trinidad and Tobago,,Trinidad and Tobago 11 | 53,BAR,52,BB,BRB,1962,,Barbados,Barbados,,Barbados 12 | 54,DMA,212,DM,DMA,1962,,Dominica,Dominica,,Dominica 13 | 55,GRN,308,GD,GRD,1962,,Grenada,Grenada,,Grenada 14 | 56,SLU,662,LC,LCA,1962,,Saint Lucia,Saint Lucia,,St. Lucia 15 | 57,SVG,670,VC,VCT,1962,,Saint Vincent and the Grenadines,Saint Vincent and the Grenadines,,St. Vincent and the Grenadines 16 | 58,AAB,28,AG,ATG,1962,,Antigua and Barbuda,Antigua and Barbuda,,Antigua & Barbuda 17 | 60,SKN,658,KN,KNA,1962,1980,"Saint Kitts, Nevis and Anguilla","Saint Kitts, Nevis and Anguilla",,St. Kitts and Nevis 18 | 60,SKN,659,KN,KNA,1981,,Saint Kitts and Nevis,Saint Kitts and Nevis,,St.
Kitts and Nevis 19 | 70,MEX,484,MX,MEX,1962,,Mexico,Mexico,,Mexico 20 | 80,BLZ,84,BZ,BLZ,1962,,Belize,Belize,,Belize 21 | 90,GUA,320,GT,GTM,1962,,Guatemala,Guatemala,,Guatemala 22 | 91,HON,340,HN,HND,1962,,Honduras,Honduras,,Honduras 23 | 92,SAL,222,SV,SLV,1962,,El Salvador,El Salvador,,El Salvador 24 | 93,NIC,558,NI,NIC,1962,,Nicaragua,Nicaragua,,Nicaragua 25 | 94,COS,188,CR,CRI,1962,,Costa Rica,Costa Rica,,Costa Rica 26 | 95,PAN,591,PA,PAN,1978,,Panama,Panama,,Panama 27 | 95,PAN,590,PA,PAN,1962,1977,"Fmr Panama, excl.Canal Zone","Former Panama, excluding Canal Zone",,Panama 28 | 100,COL,170,CO,COL,1962,,Colombia,Colombia,,Colombia 29 | 101,VEN,862,VE,VEN,1962,,Venezuela,Venezuela,,Venezuela 30 | 110,GUY,328,GY,GUY,1962,,Guyana,Guyana,,Guyana 31 | 115,SUR,740,SR,SUR,1962,,Suriname,Suriname,,Suriname 32 | 130,ECU,218,EC,ECU,1962,,Ecuador,Ecuador,,Ecuador 33 | 135,PER,604,PE,PER,1962,,Peru,Peru,,Peru 34 | 140,BRA,76,BR,BRA,1962,,Brazil,Brazil,,Brazil 35 | 145,BOL,68,BO,BOL,1962,,Bolivia (Plurinational State of),Plurinational State of Bolivia,,Bolivia 36 | 150,PAR,600,PY,PRY,1962,,Paraguay,Paraguay,,Paraguay 37 | 155,CHL,152,CL,CHL,1962,,Chile,Chile,,Chile 38 | 160,ARG,32,AR,ARG,1962,,Argentina,Argentina,,Argentina 39 | 165,URU,858,UY,URY,1962,,Uruguay,Uruguay,,Uruguay 40 | 200,UKG,826,GB,GBR,1962,,United Kingdom,United Kingdom,,United Kingdom 41 | 205,IRE,372,IE,IRL,1962,,Ireland,Ireland,,Ireland 42 | 210,NTH,528,NL,NLD,1962,,Netherlands,Netherlands,,Netherlands 43 | 211,BEL,58,BE,BEL,1962,1998,Belgium-Luxembourg,Belgium-Luxembourg,,Belgium 44 | 211,BEL,56,BE,BEL,1999,,Belgium,Belgium,,Belgium 45 | 212,LUX,442,LU,LUX,1999,,Luxembourg,Luxembourg,,Luxembourg 46 | 220,FRN,251,FR,FRA,1962,,France,"France, Monaco",Including Monaco,France 47 | 221,MNC,,,,,,Monaco,,,Monaco 48 | 223,LIE,,,,,,Liechtenstein,,,Liechtenstein 49 | 225,SWZ,757,CH,CHE,1962,,Switzerland,"Switzerland, Liechtenstein",Liechtenstein,Switzerland 50 | 230,SPN,724,ES,ESP,1962,,Spain,Spain,,Spain 51 | 232,AND,20,AD,AND,1962,,Andorra,Andorra,,Andorra 52 | 235,POR,620,PT,PRT,1962,,Portugal,Portugal,,Portugal 53 | 240,HAN,,,,,,Hanover,,,Hanover 54 | 245,BAV,,,,,,Bavaria,,,Bavaria 55 | 255,GMY,276,DE,DEU,1991,,Germany,Germany,,Germany 56 | 260,GFR,280,DE,DEU,1962,1990,Fmr Fed. Rep. of Germany,Former Federal Republic of Germany,,German Federal Republic 57 | 265,GDR,278,DD,DDR,1962,1990,Fmr Dem. Rep. 
of Germany,Former Democratic Republic of Germany,,German Democratic Republic 58 | 267,BAD,,,,,,Baden,,,Baden 59 | 269,SAX,,,,,,Saxony,,,Saxony 60 | 271,WRT,,,,,,Wuerttemburg,,,Wuerttemburg 61 | 273,HSE,,,,,,Hesse Electoral,,,Hesse Electoral 62 | 275,HSG,,,,,,Hesse Grand Ducal,,,Hesse Grand Ducal 63 | 280,MEC,,,,,,Mecklenburg Schwerin,,,Mecklenburg Schwerin 64 | 290,POL,616,PL,POL,1962,,Poland,Poland,,Poland 65 | 300,AUH,,,,,,Austria-Hungary,,,Austria-Hungary 66 | 305,AUS,40,AT,AUT,1962,,Austria,Austria,,Austria 67 | 310,HUN,348,HU,HUN,1962,,Hungary,Hungary,,Hungary 68 | 315,CZE,200,CS,CSK,1962,1992,Czechoslovakia,Czechoslovakia,,Czechoslovakia 69 | 316,CZR,203,CZ,CZE,1993,,Czechia,Czechia,,Czech Republic 70 | 317,SLO,703,SK,SVK,1993,,Slovakia,Slovakia,,Slovakia 71 | 325,ITA,381,IT,ITA,1962,,Italy,Italy,,Italy 72 | 327,PAP,,,,,,Papal States,,,Papal States 73 | 329,SIC,,,,,,Two Sicilies,,,Two Sicilies 74 | 331,SNM,674,SM,SMR,2000,,San Marino,San Marino,,San Marino 75 | 332,MOD,,,,,,Modena,,,Modena 76 | 335,PMA,,,,,,Parma,,,Parma 77 | 337,TUS,,,,,,Tuscany,,,Tuscany 78 | 338,MLT,470,MT,MLT,1962,,Malta,Malta,,Malta 79 | 339,ALB,8,AL,ALB,1962,,Albania,Albania,,Albania 80 | 341,MNG,499,ME,MNE,2006,,Montenegro,Montenegro,,Montenegro 81 | 343,,446,MO,MAC,1962,,"China, Macao SAR","China, Macao Special Administrative Region",,Macedonia 82 | 343,MAC,807,MK,MKD,1993,,TFYR of Macedonia,The Former Yugoslav Republic of Macedonia,,Macedonia 83 | 344,CRO,191,HR,HRV,1992,,Croatia,Croatia,,Croatia 84 | 345,YUG,890,YU,YUG,1962,1991,Fmr Yugoslavia,Former Yugoslavia,In 1992 including TFYR of Macedonia,Yugoslavia 85 | 346,BOS,70,BA,BIH,1992,,Bosnia Herzegovina,Bosnia Herzegovina,,Bosnia and Herzegovina 86 | 347,KOS,,,,,,Kosovo,,,Kosovo 87 | 349,SLV,705,SI,SVN,1992,,Slovenia,Slovenia,,Slovenia 88 | 350,GRC,300,GR,GRC,1962,,Greece,Greece,,Greece 89 | 352,CYP,196,CY,CYP,1962,,Cyprus,Cyprus,,Cyprus 90 | 355,BUL,100,BG,BGR,1962,,Bulgaria,Bulgaria,,Bulgaria 91 | 359,MLD,498,MD,MDA,1992,,Rep. 
of Moldova,Republic of Moldova,,Moldova 92 | 360,ROM,642,RO,ROU,1962,,Romania,Romania,,Romania 93 | 365,RUS,643,RU,RUS,1992,,Russian Federation,Russian Federation,,Russia 94 | 366,EST,233,EE,EST,1992,,Estonia,Estonia,,Estonia 95 | 367,LAT,428,LV,LVA,1992,,Latvia,Latvia,,Latvia 96 | 368,LIT,440,LT,LTU,1992,,Lithuania,Lithuania,,Lithuania 97 | 369,UKR,804,UA,UKR,1992,,Ukraine,Ukraine,,Ukraine 98 | 370,BLR,112,BY,BLR,1992,,Belarus,Belarus,,Belarus 99 | 371,ARM,51,AM,ARM,1992,,Armenia,Armenia,,Armenia 100 | 372,GRG,268,GE,GEO,1992,,Georgia,Georgia,,Georgia 101 | 373,AZE,31,AZ,AZE,1992,,Azerbaijan,Azerbaijan,,Azerbaijan 102 | 375,FIN,246,FI,FIN,1962,,Finland,Finland,,Finland 103 | 380,SWD,752,SE,SWE,1962,,Sweden,Sweden,,Sweden 104 | 385,NOR,579,NO,NOR,1962,,Norway,"Norway, Svalbard and Jan Mayen",Including Svalbard and Jan Mayen,Norway 105 | 390,DEN,208,DK,DNK,1962,,Denmark,Denmark,,Denmark 106 | 395,ICE,352,IS,ISL,1962,,Iceland,Iceland,,Iceland 107 | 402,CAP,132,CV,CPV,1962,,Cabo Verde,Cabo Verde,,Cape Verde 108 | 403,STP,678,ST,STP,1962,,Sao Tome and Principe,Sao Tome and Principe,,Sao Tome and Principe 109 | 404,GNB,624,GW,GNB,1962,,Guinea-Bissau,Guinea-Bissau,,Guinea-Bissau 110 | 411,EQG,226,GQ,GNQ,1962,,Equatorial Guinea,Equatorial Guinea,,Equatorial Guinea 111 | 420,GAM,270,GM,GMB,1962,,Gambia,Gambia,,Gambia 112 | 432,MLI,466,ML,MLI,1962,,Mali,Mali,,Mali 113 | 433,SEN,686,SN,SEN,1962,,Senegal,Senegal,,Senegal 114 | 434,BEN,204,BJ,BEN,1962,,Benin,Benin,,Benin 115 | 435,MAA,478,MR,MRT,1962,,Mauritania,Mauritania,,Mauritania 116 | 436,NIR,562,NE,NER,1962,,Niger,Niger,,Niger 117 | 437,CDI,384,CI,CIV,1962,,Côte d'Ivoire,Côte d'Ivoire,,Ivory Coast 118 | 438,GUI,324,GN,GIN,1962,,Guinea,Guinea,,Guinea 119 | 439,BFO,854,BF,BFA,1962,,Burkina Faso,Burkina Faso,,Burkina Faso 120 | 450,LBR,430,LR,LBR,1962,,Liberia,Liberia,,Liberia 121 | 451,SIE,694,SL,SLE,1962,,Sierra Leone,Sierra Leone,,Sierra Leone 122 | 452,GHA,288,GH,GHA,1962,,Ghana,Ghana,,Ghana 123 | 461,TOG,768,TG,TGO,1962,,Togo,Togo,,Togo 124 | 471,CAO,120,CM,CMR,1962,,Cameroon,Cameroon,,Cameroon 125 | 475,NIG,566,NG,NGA,1962,,Nigeria,Nigeria,,Nigeria 126 | 481,GAB,266,GA,GAB,1962,,Gabon,Gabon,,Gabon 127 | 482,CEN,140,CF,CAF,1962,,Central African Rep.,Central African Republic,,Central African Republic 128 | 483,CHA,148,TD,TCD,1962,,Chad,Chad,,Chad 129 | 484,CON,178,CG,COG,1962,,Congo,Congo,,Congo 130 | 490,DRC,180,CD,COD,1962,,Dem. Rep. of the Congo,Democratic Republic of the Congo,,Democratic Republic of the Congo 131 | 500,UGA,800,UG,UGA,1962,,Uganda,Uganda,,Uganda 132 | 501,KEN,404,KE,KEN,1962,,Kenya,Kenya,,Kenya 133 | 510,TAZ,834,TZ,TZA,1965,,United Rep. of Tanzania,United Republic of Tanzania,,Tanzania 134 | 511,ZAN,,,,,,Zanzibar,,,Zanzibar 135 | 516,BUI,108,BI,BDI,1962,,Burundi,Burundi,,Burundi 136 | 517,RWA,646,RW,RWA,1962,,Rwanda,Rwanda,,Rwanda 137 | 520,SOM,706,SO,SOM,1962,,Somalia,Somalia,,Somalia 138 | 522,DJI,262,DJ,DJI,1962,,Djibouti,Djibouti,,Djibouti 139 | 530,ETH,230,ET,ETH,1962,1992,Fmr Ethiopia,Former Ethiopia,,Ethiopia 140 | 530,ETH,231,ET,ETH,1993,,Ethiopia,Ethiopia,,Ethiopia 141 | 531,ERI,232,ER,ERI,1993,,Eritrea,Eritrea,,Eritrea 142 | 540,ANG,24,AO,AGO,1962,,Angola,Angola,,Angola 143 | 541,MZM,508,MZ,MOZ,1962,,Mozambique,Mozambique,,Mozambique 144 | 551,ZAM,894,ZM,ZMB,1965,,Zambia,Zambia,,Zambia 145 | 552,ZIM,716,ZW,ZWE,1965,,Zimbabwe,Zimbabwe,,Zimbabwe 146 | 553,MAW,454,MW,MWI,1965,,Malawi,Malawi,,Malawi 147 | 560,SAF,710,ZA,ZAF,2000,,South Africa,South Africa,,South Africa 148 | 560,SAF,711,ZA,ZAF,1962,1999,So. 
African Customs Union,Southern African Customs Union,,South Africa 149 | 565,NAM,516,NA,NAM,2000,,Namibia,Namibia,,Namibia 150 | 570,LES,426,LS,LSO,2000,,Lesotho,Lesotho,,Lesotho 151 | 571,BOT,72,BW,BWA,2000,,Botswana,Botswana,,Botswana 152 | 572,SWA,748,SZ,SWZ,2000,,Swaziland,Swaziland,,Swaziland 153 | 580,MAG,450,MG,MDG,1962,,Madagascar,Madagascar,,Madagascar 154 | 581,COM,174,KM,COM,1962,,Comoros,Comoros,,Comoros 155 | 590,MAS,480,MU,MUS,1962,,Mauritius,Mauritius,,Mauritius 156 | 591,SEY,690,SC,SYC,1962,,Seychelles,Seychelles,,Seychelles 157 | 600,MOR,504,MA,MAR,1962,,Morocco,Morocco,,Morocco 158 | 615,ALG,12,DZ,DZA,1962,,Algeria,Algeria,,Algeria 159 | 616,TUN,788,TN,TUN,1962,,Tunisia,Tunisia,,Tunisia 160 | 620,LIB,434,LY,LBY,1962,,Libya,Libya,,Libya 161 | 625,SUD,729,SD,SDN,2012,,Sudan,Sudan,"Refers to the new Sudan, now excluding the southern part.",Sudan 162 | 625,SUD,736,SD,SDN,1962,2011,Fmr Sudan,Former Sudan,,Sudan 163 | 626,SSD,728,SS,SSD,2012,,South Sudan,South Sudan,,South Sudan 164 | 630,IRN,364,IR,IRN,1962,,Iran,Iran,,Iran 165 | 640,TUR,792,TR,TUR,1962,,Turkey,Turkey,,Turkey 166 | 645,IRQ,368,IQ,IRQ,1962,,Iraq,Iraq,,Iraq 167 | 651,EGY,818,EG,EGY,1962,,Egypt,Egypt,,Egypt 168 | 652,SYR,760,SY,SYR,1962,,Syria,Syria,,Syria 169 | 660,LEB,422,LB,LBN,1962,,Lebanon,Lebanon,,Lebanon 170 | 663,JOR,400,JO,JOR,1962,,Jordan,Jordan,,Jordan 171 | 666,ISR,376,IL,ISR,1962,,Israel,Israel,,Israel 172 | 670,SAU,682,SA,SAU,1962,,Saudi Arabia,Saudi Arabia,,Saudi Arabia 173 | 678,YAR,886,YE,YEM,1962,1990,Fmr Arab Rep. of Yemen,Former Arab Republic of Yemen,,Yemen Arab Republic 174 | 679,YEM,887,YE,YEM,1991,,Yemen,Yemen,,Yemen 175 | 680,YPR,720,YD,YMD,1962,1990,Fmr Dem. Yemen,Former Democratic Yemen,,Yemen People's Republic 176 | 690,KUW,414,KW,KWT,1962,,Kuwait,Kuwait,,Kuwait 177 | 692,BAH,48,BH,BHR,1962,,Bahrain,Bahrain,,Bahrain 178 | 694,QAT,634,QA,QAT,1962,,Qatar,Qatar,,Qatar 179 | 696,UAE,784,AE,ARE,1962,,United Arab Emirates,United Arab Emirates,,United Arab Emirates 180 | 698,OMA,512,OM,OMN,1962,,Oman,Oman,,Oman 181 | 700,AFG,4,AF,AFG,1962,,Afghanistan,Afghanistan,,Afghanistan 182 | 701,TKM,795,TM,TKM,1992,,Turkmenistan,Turkmenistan,,Turkmenistan 183 | 702,TAJ,762,TJ,TJK,1992,,Tajikistan,Tajikistan,,Tajikistan 184 | 703,KYR,417,KG,KGZ,1992,,Kyrgyzstan,Kyrgyzstan,,Kyrgyzstan 185 | 704,UZB,860,UZ,UZB,1992,,Uzbekistan,Uzbekistan,,Uzbekistan 186 | 705,KZK,398,KZ,KAZ,1992,,Kazakhstan,Kazakhstan,,Kazakhstan 187 | 710,CHN,156,CN,CHN,1962,,China,China,,China 188 | 712,MON,496,MN,MNG,1962,,Mongolia,Mongolia,,Mongolia 189 | 713,TAW,,,,,,Taiwan,,,Taiwan 190 | 730,KOR,,,,,,Korea,,,Korea 191 | 731,PRK,408,KP,PRK,1962,,Dem. People's Rep. of Korea,Democratic People's Republic of Korea,,North Korea 192 | 732,ROK,410,KR,KOR,1962,,Rep. of Korea,Republic of Korea,,South Korea 193 | 740,JPN,392,JP,JPN,1962,,Japan,Japan,,Japan 194 | 750,IND,356,IN,IND,1962,1974,"India, excl. 
Sikkim","India, excluding Sikkim",,India 195 | 750,IND,699,IN,IND,1975,,India,India,,India 196 | 760,BHU,64,BT,BTN,1962,,Bhutan,Bhutan,,Bhutan 197 | 770,PAK,586,PK,PAK,1972,,Pakistan,Pakistan,,Pakistan 198 | 770,PAK,588,PK,PAK,1962,1971,East and West Pakistan,East and West Pakistan,,Pakistan 199 | 771,BNG,50,BD,BGD,1972,,Bangladesh,Bangladesh,,Bangladesh 200 | 775,MYA,104,MM,MMR,1962,,Myanmar,Myanmar,,Myanmar 201 | 780,SRI,144,LK,LKA,1962,,Sri Lanka,Sri Lanka,,Sri Lanka 202 | 781,MAD,462,MV,MDV,1962,,Maldives,Maldives,,Maldives 203 | 790,NEP,524,NP,NPL,1962,,Nepal,Nepal,,Nepal 204 | 800,THI,764,TH,THA,1962,,Thailand,Thailand,,Thailand 205 | 811,CAM,116,KH,KHM,1962,,Cambodia,Cambodia,,Cambodia 206 | 812,LAO,418,LA,LAO,1962,,Lao People's Dem. Rep.,Lao People's Dem. Rep.,,Laos 207 | 816,DRV,,,,,,Vietnam,,,Vietnam 208 | 817,RVN,868,VN,VNM,1962,1974,Fmr Rep. of Vietnam,Former Republic of Vietnam,,Republic of Vietnam 209 | 817,RVN,704,VN,VNM,1975,,Viet Nam,Viet Nam,,Republic of Vietnam 210 | 820,MAL,458,MY,MYS,1964,,Malaysia,Malaysia,,Malaysia 211 | 830,SIN,702,SG,SGP,1962,,Singapore,Singapore,,Singapore 212 | 835,BRU,96,BN,BRN,1962,,Brunei Darussalam,Brunei Darussalam,,Brunei 213 | 840,PHI,608,PH,PHL,1962,,Philippines,Philippines,,Philippines 214 | 850,INS,360,ID,IDN,1962,,Indonesia,Indonesia,,Indonesia 215 | 860,ETM,626,TL,TLS,1962,,Timor-Leste,Timor-Leste,,East Timor 216 | 900,AUL,36,AU,AUS,1962,,Australia,Australia,,Australia 217 | 910,PNG,598,PG,PNG,1962,,Papua New Guinea,Papua New Guinea,,Papua New Guinea 218 | 920,NEW,554,NZ,NZL,1962,,New Zealand,New Zealand,,New Zealand 219 | 935,VAN,548,VU,VUT,1962,,Vanuatu,Vanuatu,,Vanuatu 220 | 940,SOL,90,SB,SLB,1962,,Solomon Isds,Solomon Islands,,Solomon Islands 221 | 946,KIR,296,KI,KIR,1962,,Kiribati,Kiribati,,Kiribati 222 | 947,TUV,798,TV,TUV,1962,,Tuvalu,Tuvalu,,Tuvalu 223 | 950,FIJ,242,FJ,FJI,1962,,Fiji,Fiji,,Fiji 224 | 955,TON,776,TO,TON,1962,,Tonga,Tonga,,Tonga 225 | 970,NAU,520,NR,NRU,1962,,Nauru,Nauru,,Nauru 226 | 983,MSI,584,MH,MHL,1992,,Marshall Isds,Marshall Islands,,Marshall Islands 227 | 986,PAL,585,PW,PLW,1992,,Palau,Palau,,Palau 228 | 987,FSM,583,FM,FSM,1992,,FS Micronesia,Federated State of Micronesia,,Federated States of Micronesia 229 | 990,WSM,882,WS,WSM,1962,,Samoa,Samoa,,Samoa 230 | ,,533,AW,ABW,1988,,Aruba,Aruba,, 231 | ,,660,AI,AIA,1981,,Anguilla,Anguilla,, 232 | ,,532,AN,ANT,1962,1987,Neth. Antilles and Aruba,Netherlands Antilles and Aruba,, 233 | ,,530,AN,ANT,1988,2010,Neth. Antilles,Netherlands Antilles,, 234 | ,,16,AS,ASM,1962,,American Samoa,American Samoa,, 235 | ,,10,AQ,ATA,1962,,Antarctica,Antarctica,, 236 | ,,260,FQ,ATF,1962,,Fr. 
South Antarctic Terr.,French South Antarctic Territories,, 237 | ,,535,BQ,BES,2011,,Bonaire,"Bonaire, Saint Eustatius and Saba",, 238 | ,,652,BL,BLM,2013,,Saint Barthélemy,Saint Barthélemy,, 239 | ,,60,BM,BMU,1962,,Bermuda,Bermuda,, 240 | ,,166,CC,CCK,1962,,Cocos Isds,Cocos Islands,, 241 | ,,184,CK,COK,1962,,Cook Isds,Cook Islands,, 242 | ,,531,CW,CUW,2010,,Curaçao,Curaçao,, 243 | ,,162,CX,CXR,1962,,Christmas Isds,Christmas Islands,, 244 | ,,136,KY,CYM,1962,,Cayman Isds,Cayman Islands,, 245 | ,,732,EH,ESH,1962,,Western Sahara,Western Sahara,, 246 | ,,97,EU,EU2,1962,,EU-28,EU-28,, 247 | ,,238,FK,FLK,1962,,Falkland Isds (Malvinas),Falkland Islands (Malvinas),, 248 | ,,234,FO,FRO,1962,,Faeroe Isds,Faeroe Islands,, 249 | ,,292,GI,GIB,1962,,Gibraltar,Gibraltar,, 250 | ,,312,GP,GLP,1962,1995,Guadeloupe,Guadeloupe,, 251 | ,,304,GL,GRL,1962,,Greenland,Greenland,, 252 | ,,254,GF,GUF,1962,1995,French Guiana,French Guiana,, 253 | ,,316,GU,GUM,1962,,Guam,Guam,, 254 | ,,344,HK,HKG,1962,,"China, Hong Kong SAR","China, Hong Kong Special Administrative Region",, 255 | ,,334,HM,HMD,1962,,Heard Island and McDonald Islands,Heard Island and McDonald Islands,, 256 | ,,86,IO,IOT,1962,,Br. Indian Ocean Terr.,British Indian Ocean Territories,, 257 | ,,580,MP,MNP,1992,,N. Mariana Isds,Northern Mariana Islands,, 258 | ,,500,MS,MSR,1962,,Montserrat,Montserrat,, 259 | ,,474,MQ,MTQ,1962,1995,Martinique,Martinique,, 260 | ,,175,YT,MYT,1962,,Mayotte,Mayotte,, 261 | ,,129,N/A,N/A,1962,2004,"Caribbean, nes","Caribbean, not elsewhere specified",, 262 | ,,838,N/A,N/A,1962,,Free Zones,Free Zones,, 263 | ,,221,N/A,N/A,1962,2004,"Eastern Europe, nes","Eastern Europe, not elsewhere specified",, 264 | ,,837,N/A,N/A,1962,,Bunkers,Bunkers,, 265 | ,,835,N/A,N/A,1962,1964,Fmr Tanganyika,Former Tanganyika,, 266 | ,,490,N/A,N/A,1962,,"Other Asia, nes","Other Asia, not elsewhere specified",, 267 | ,,577,N/A,N/A,1962,,"Other Africa, nes","Other Africa, not elsewhere specified",, 268 | ,,849,N/A,N/A,1962,1962,US Misc. Pacific Isds,US Miscellaneous Pacific Islands,, 269 | ,,290,N/A,N/A,1962,2004,"Northern Africa, nes","Northern Africa, not elsewhere specified",, 270 | ,,459,N/A,N/A,1962,1963,Peninsula Malaysia,Peninsula Malaysia,, 271 | ,,836,N/A,N/A,1962,1964,Fmr Zanzibar and Pemba Isd,Former Zanzibar and Pemba Island,, 272 | ,,717,N/A,N/A,1962,1964,Fmr Rhodesia Nyas,Former Rhodesia Nyas,, 273 | ,,527,N/A,N/A,1962,,"Oceania, nes","Oceania, not elsewhere specified",, 274 | ,,899,N/A,N/A,1962,,"Areas, nes","Areas, not elsewhere specified",, 275 | ,,473,N/,N/A,1962,,"LAIA, nes","LAIA, not elsewhere specified",, 276 | ,,647,N/A,N/A,1962,1972,Ryukyu Isd,Ryukyu Island,, 277 | ,,698,N/A,N/A,1962,1974,Sikkim,Sikkim,, 278 | ,,492,N/A,N/A,1962,2004,"Europe EU, nes","Europe EU, not elsewhere specified",Code 492 is mapped to code 568 since 2005, 279 | ,,80,N/A,N/A,1962,,Br. 
Antarctic Terr.,British Antarctic Territories,, 280 | ,,74,N/A,N/A,1962,,Bouvet Island,Ile Bouvet,, 281 | ,,568,N/A,N/A,1962,,"Other Europe, nes","Other Europe, not elsewhere specified",, 282 | ,,697,N/A,N/A,1962,2004,"Europe EFTA, nes","Europe EFTA, not elsewhere specified",, 283 | ,,879,N/A,N/A,1962,2004,"Western Asia, nes","Western Asia, not elsewhere specified",, 284 | ,,636,N/A,N/A,1962,2004,"Rest of America, nes","Rest of America, not elsewhere specified",, 285 | ,,839,N/A,N/A,1962,,Special Categories,Special Categories,, 286 | ,,471,N/A,N/A,1962,2004,"CACM, nes","CACM, not elsewhere specified",, 287 | ,,472,N/A,N/A,1962,2004,"Africa CAMEU region, nes","Africa CAMEU region, not elsewhere specified",, 288 | ,,461,N/A,N/A,1962,1963,Sabah,Sabah,, 289 | ,,637,N/A,N/A,1962,,"North America and Central America, nes","North America, the Caribbean and Central America not elsewhere specified",, 290 | ,,536,N/A,N/A,1962,,Neutral Zone,Neutral Zone,, 291 | ,,457,N/A,N/A,1962,1963,Sarawak,Sarawak,, 292 | ,,540,NC,NCL,1962,,New Caledonia,New Caledonia,, 293 | ,,574,NF,NFK,1962,,Norfolk Isds,Norfolk Islands,, 294 | ,,570,NU,NIU,1962,,Niue,Niue,, 295 | ,,582,PC,PCI,1962,1991,Fmr Pacific Isds,Former Pacific Islands,, 296 | ,,612,PN,PCN,1962,,Pitcairn,Pitcairn,, 297 | ,,592,PZ,PCZ,1962,1977,Fmr Panama-Canal-Zone,Former Panama-Canal-Zone,, 298 | ,,275,PS,PSE,2000,,State of Palestine,State of Palestine,, 299 | ,,258,PF,PYF,1962,,French Polynesia,French Polynesia,, 300 | ,,638,RE,REU,1962,1995,Réunion,Réunion,, 301 | ,,891,CS,SCG,1992,2005,Serbia and Montenegro,Serbia and Montenegro,, 302 | ,,239,GS,SGS,1962,,South Georgia and the South Sandwich Islands,South Georgia and the South Sandwich Islands,, 303 | ,,654,SH,SHN,1962,,Saint Helena,Saint Helena,, 304 | ,,666,PM,SPM,1962,,Saint Pierre and Miquelon,Saint Pierre and Miquelon,, 305 | ,,688,RS,SRB,2006,,Serbia,Serbia,, 306 | ,,810,SU,SUN,1962,1991,Fmr USSR,Former USSR,, 307 | ,,534,SX,SXM,2010,,Saint Maarten,Saint Maarten (Dutch part),, 308 | ,,796,TC,TCA,1962,,Turks and Caicos Isds,Turks and Caicos Islands,, 309 | ,,772,TK,TKL,1962,,Tokelau,Tokelau,, 310 | ,,581,UM,UMI,1962,,United States Minor Outlying Islands,United States Minor Outlying Islands,United States Minor Outlying Islands, 311 | ,,336,VA,VAT,2000,,Holy See (Vatican City State),Holy See (Vatican City State),, 312 | ,,866,VD,VDR,1962,1974,Fmr Dem. Rep. of Vietnam,Former Democratic Republic of Vietnam,, 313 | ,,92,VG,VGB,1962,,Br. Virgin Isds,British Virgin Islands,, 314 | ,,850,VI,VIR,1962,1980,US Virgin Isds,US Virgin Islands,, 315 | ,,0,WL,WLD,1962,,World,World,, 316 | ,,876,WF,WLF,1962,,Wallis and Futuna Isds,Wallis and Futuna Islands,, 317 | -------------------------------------------------------------------------------- /NGEC/assets/event_mode_questions.csv: -------------------------------------------------------------------------------- 1 | event_type,mode,ACTOR,RECIP,LOCATION,TIME,notes 2 | ASSAULT,,"Who carried out an attack on {recip_text}? 3 | Who attacked {recip_text}?","Who was attacked by {actor_text}? 4 | Who did {actor_text} attack?",Where did the attack take place?,When did the attack take place?, 5 | COERCE,,Who coerced {recip_text}?,"Who was coerced by {actor_text}? 6 | Who did {actor_text} coerce?",Where did the coercion occur?,When did the coercion occur?, 7 | PROTEST,,"Who protested? 8 | Who protested against {recip_text}?","Who was the protest directed against? 
9 | Who was {actor_text} protesting against?",Where did the protest take place?,When did the protest take place?, 10 | MOBILIZE,,"Who mobilized forces? 11 | Who mobilized forces against {recip_text}?","Who did {actor_text} mobilize forces against? 12 | Who was the mobilization directed against?",Where were forces mobilized?,When were forces mobilized?, 13 | SANCTION,,"Who sanctioned {recip_text}? 14 | Who punished {recip_text}? 15 | Who imposed sanctions against {recip_text}?","Who was sanctioned? 16 | Who was sanctioned by {actor_text}?",Where were sanctions imposed?,When were sanctions imposed?, 17 | THREATEN,,"Who threatened {recip_text}? 18 | Who made a threat against {recip_text}?","Who was threatened? 19 | Who was threatened by {actor_text}?",Where was the threat made?,When was the threat made?, 20 | REQUEST,,"Who made a request? 21 | Who requested something from {recip_text}?","Who did {actor_text} request something from? 22 | Who was the request directed toward?",Where was the request made?,When was the request made?, 23 | REJECT,,Who rejected something?,"Whose claim was rejected by someone? 24 | Whose claim was rejected by {actor_text}?",Where did the rejection occur?,When did the rejection occur?, 25 | ACCUSE,,"Who accused {recip_text} of something? 26 | Who accused {recip_text}?","Who was the accusation against? 27 | Who did {actor_text} accuse of something?",Where did the accusation occur?,When did the accusation occur?, 28 | CONSULT,,"Who consulted? 29 | Who did {recip_text} consult with?","Who consulted? 30 | Who did {actor_text} consult with? 31 | Who was consulted?",Where did the consultation occur?,When did the consultation occur?, 32 | AGREE,,Who agreed with {recip_text}?,Who did {actor_text} agree with?,Where was the agreement reached?,When was the agreement reached?, 33 | SUPPORT,,"Who gave support to {recip_text}? 34 | Who expressed support for {recip_text}?","Who received support? 35 | Who was supported by {actor_text}?",Where was support expressed?,When was support expressed?, 36 | CONCEDE,,"Who made a concession? 37 | Who made a concession to {recip_text}?","Who was the concession made to? 38 | Who did {actor_text} make a concession to?",Where were concessions made?,When were concessions made?, 39 | COOPERATE,,"Who cooperated? 40 | Who cooperated with {recip_text}?","Who cooperated? 41 | Who cooperated with {actor_text}?",Where was the agreement signed?,When was the agreement signed?, 42 | AID,,"Who gave aid? 43 | Who gave aid to {recip_text}?","Who received aid? 44 | Who got aid from {actor_text}?",Where was the aid delivered?,When was the aid delivered?, 45 | RETREAT,,"Who retreated or gave something up? 46 | Who retreated from {recip_text}?","Who did someone retreat from? 47 | Who did {actor_text} retreat from?",Where did the retreat occur?,When did the retreat occur?, 48 | COERCE,seize,Who seized something from {recip_text}?,Who did {actor_text} seize something from?,Where did the seizure of property take place?,When did the seizure of property take place?, 49 | COERCE,restrict,"Who imposed restrictions? 50 | Who restricted the rights of {recip_text}?",Whose rights were restricted by {actor_text}?,Where were restrictions imposed?,When were restrictions imposed?, 51 | COERCE,ban,Who banned {recip_text}?,Who did {actor_text} ban?,Where was the ban applied?,When was the ban applied?, 52 | COERCE,censor,"Who did the censoring? 53 | Who censored {recip_text}?","Who did {actor_text} censor? 
54 | Who was censored?",Where was censoring applied?,When was censoring applied?, 55 | COERCE,curfew,"Who imposed a curfew? 56 | Who restricted {recip_text}'s movement?","Who did {actor_text} impose a curfew on? 57 | Whose movement was restricted? 58 | Whose movement was restricted by {actor_text}?",Where was the curfew imposed?,When was the curfew imposed?, 59 | COERCE,martial-law,Who declared martial law?,Who did {actor_text} impose martial law on?,Where was martial law declared?,When was martial law declared?, 60 | COERCE,arrest,"Who arrested {recip_text}? 61 | Who detained {recip_text}?","Who was arrested by {actor_text}? 62 | Who was detained by {actor_text}?",Where were arrests made?,When were arrests made?, 63 | COERCE,deport,Who deported {recip_text}?,Who did {actor_text} deport?,Where was the deportation made from?,When did the deportation take place?, 64 | COERCE,withhold,Who withheld or shut off supplies for {recip_text}?,Who did {actor_text} withhold or shut off supplies to?,Where were supplies restricted?,When were supplies restricted?, 65 | COERCE,misinformation,Who carried out a misinformation campaign against {recip_text}?,Who did {actor_text} carry out a misinformation campaign against?,Where did the misinformation campaign occur?,When did the misinformation campaign occur?, -------------------------------------------------------------------------------- /NGEC/assets/event_models/ACCUSE.skops: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/event_models/ACCUSE.skops -------------------------------------------------------------------------------- /NGEC/assets/event_models/AGREE.skops: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/event_models/AGREE.skops -------------------------------------------------------------------------------- /NGEC/assets/event_models/AID.skops: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/event_models/AID.skops -------------------------------------------------------------------------------- /NGEC/assets/event_models/ASSAULT.skops: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/event_models/ASSAULT.skops -------------------------------------------------------------------------------- /NGEC/assets/event_models/COERCE.skops: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/event_models/COERCE.skops -------------------------------------------------------------------------------- /NGEC/assets/event_models/CONCEDE.skops: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/event_models/CONCEDE.skops -------------------------------------------------------------------------------- /NGEC/assets/event_models/CONSULT.skops: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/event_models/CONSULT.skops -------------------------------------------------------------------------------- /NGEC/assets/event_models/COOPERATE.skops: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/event_models/COOPERATE.skops -------------------------------------------------------------------------------- /NGEC/assets/event_models/MOBILIZE.skops: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/event_models/MOBILIZE.skops -------------------------------------------------------------------------------- /NGEC/assets/event_models/PROTEST.skops: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/event_models/PROTEST.skops -------------------------------------------------------------------------------- /NGEC/assets/event_models/REJECT.skops: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/event_models/REJECT.skops -------------------------------------------------------------------------------- /NGEC/assets/event_models/REQUEST.skops: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/event_models/REQUEST.skops -------------------------------------------------------------------------------- /NGEC/assets/event_models/RETREAT.skops: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/event_models/RETREAT.skops -------------------------------------------------------------------------------- /NGEC/assets/event_models/SANCTION.skops: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/event_models/SANCTION.skops -------------------------------------------------------------------------------- /NGEC/assets/event_models/SUPPORT.skops: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/event_models/SUPPORT.skops -------------------------------------------------------------------------------- /NGEC/assets/event_models/THREATEN.skops: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/event_models/THREATEN.skops -------------------------------------------------------------------------------- /NGEC/assets/option_model.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/option_model.pt -------------------------------------------------------------------------------- 
/NGEC/assets/pattern_matrix.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/pattern_matrix.npy -------------------------------------------------------------------------------- /NGEC/assets/pattern_matrix.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/pattern_matrix.pkl -------------------------------------------------------------------------------- /NGEC/context_class.py: -------------------------------------------------------------------------------- 1 | import sklearn 2 | 3 | def _load_model(model_dir): 4 | """ 5 | Load the context classification models. 6 | 7 | Parameters 8 | ---------- 9 | model_dir: Path 10 | path to the context classification models 11 | 12 | Returns 13 | ------ 14 | model_dict: dict 15 | With context classes as keys and models as values. 16 | """ 17 | raise NotImplementedError() 18 | 19 | 20 | class ContextClass: 21 | def __init__(self, 22 | model_dir="assets/context_class_models/", 23 | threshold=0.6 # we can set stuff like this here 24 | ): 25 | 26 | self.model_dict = _load_model(model_dir) 27 | self.threshold = threshold 28 | 29 | 30 | def process(self, story_list): 31 | """ 32 | Process a list of stories to detect the document context. 33 | 34 | Example 35 | ------- 36 | The input is a list of dictionaries, each with an 'event_text' key with the full text of the story. 37 | 38 | {'date': '2019-08-01', 39 | 'event_text': 'Indonesia is investigating a report that ... ', 40 | 'headline': 'Indonesia says it is probing a report of a ...', 41 | 'id': '', 42 | 'pub_date': '2019-08-01', 43 | 'publisher': '', 44 | 'story_id': '', 45 | 'version': ''} 46 | 47 | Parameters 48 | ---------- 49 | story_list: list of dicts 50 | Each dictionary must have an 'event_text' key with the full text of the story. 51 | 52 | Returns 53 | ------- 54 | story_list: list of dicts 55 | Each story dictionary now contains a 'contexts' key with a list of detected contexts (str). E.g.: 56 | 'contexts': ['diplomatic', 'legal'] 57 | 58 | 59 | """ 60 | # validate the input before the (not yet implemented) classification step 61 | for story in story_list: 62 | if 'event_text' not in story: 63 | raise ValueError("No 'event_text' key in input.") 64 | raise NotImplementedError() -------------------------------------------------------------------------------- /NGEC/event_class.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers import SentenceTransformer 2 | from sklearn.linear_model import LogisticRegression 3 | # a safer pickle alternative 4 | import skops.io as sio 5 | import numpy as np 6 | 7 | 8 | class EventClass: 9 | def __init__(self, 10 | model_dir="NGEC/assets/event_models/", 11 | threshold=0.6, 12 | progress_bar=False, 13 | event_types = ['ACCUSE', 'AGREE', 'AID', 'ASSAULT', 'COERCE', 'CONCEDE', 14 | 'CONSULT', 'COOPERATE', 'MOBILIZE', 'PROTEST', 'REJECT', 'REQUEST', 15 | 'RETREAT', 'SANCTION', 'SUPPORT', 'THREATEN'] 16 | ): 17 | self.model = SentenceTransformer('sentence-transformers/paraphrase-mpnet-base-v2') 18 | self.model_dir = model_dir 19 | self.threshold = threshold 20 | self.progress_bar = progress_bar 21 | self.event_types = event_types 22 | self.model_dict = self._load_model(model_dir) 23 | print("Event classification models loaded. NOTE: these models are not the production models used to produce the POLECAT dataset.
Instead, these are demonstration models for the PLOVER ontology trained on synthetic text. If you are making custom event data, you'll need to train your own models. See the `setup` directory in the NGEC repo (github.com/ahalterman/NGEC).") 24 | 25 | def _load_model(self, model_dir): 26 | """ 27 | Load the event classification models. 28 | 29 | Parameters 30 | ---------- 31 | model_dir: Path 32 | path to the event classification models 33 | 34 | Returns 35 | ------ 36 | model_dict: dict 37 | With event classes as keys and models as values. 38 | """ 39 | model_dict = {} 40 | for event in self.event_types: 41 | model_dict[event] = sio.load(f"{model_dir}/{event}.skops") 42 | return model_dict 43 | 44 | def process(self, story_list): 45 | """ 46 | Process a list of stories to detect the event class. 47 | 48 | Example 49 | ------- 50 | The input is a list of dictionaries, each with an 'event_text' key with the full text of the story. 51 | 52 | {'date': '2019-08-01', 53 | 'event_text': 'Indonesia is investigating a report that ... ', 54 | 'headline': 'Indonesia says it is probing a report of a ...', 55 | 'id': '', 56 | 'pub_date': '2019-08-01', 57 | 'publisher': '', 58 | 'story_id': '', 59 | 'version': ''} 60 | 61 | Parameters 62 | ---------- 63 | story_list: list of dicts 64 | Each dictionary must have an 'event_text' key with the full text of the story. 65 | 66 | Returns 67 | ------- 68 | story_list: list of dicts 69 | Each story dictionary now contains an 'event_type' key with a list of detected events (str). E.g.: 70 | 'event_type': ['SANCTION', 'MOBILIZE'] 71 | 72 | """ 73 | text = [i['event_text'] for i in story_list] 74 | embeddings = self.model.encode(text, show_progress_bar=self.progress_bar) 75 | 76 | preds = [] 77 | for event, clf in self.model_dict.items(): 78 | y_pred = clf.predict_proba(embeddings)[:,1] 79 | preds.append(y_pred) 80 | 81 | pred_array = np.array(preds).T 82 | 83 | # convert the matrix of predictions to a list of lists 84 | preds = [] 85 | for i in pred_array: 86 | preds.append([self.event_types[j] for j in np.where(i > self.threshold)[0]]) 87 | 88 | for n, story in enumerate(story_list): 89 | story['event_type'] = preds[n] 90 | 91 | return story_list 92 | -------------------------------------------------------------------------------- /NGEC/formatter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rich import print 3 | import jsonlines 4 | 5 | import pandas as pd 6 | import os 7 | import dateparser 8 | from datetime import datetime 9 | import re 10 | 11 | import logging 12 | logger = logging.getLogger(__name__) 13 | logger.addHandler(logging.NullHandler()) 14 | 15 | # silence dateparser warning. https://github.com/scrapinghub/dateparser/issues/1013 16 | import warnings 17 | warnings.filterwarnings( 18 | "ignore", 19 | message="The localize method is no longer necessary, as this time zone supports the fold attribute", 20 | ) 21 | 22 | 23 | def country_name_dict(base_path): 24 | file = os.path.join(base_path, "countries.csv") 25 | countries = pd.read_csv(file) 26 | country_name_dict = {i:j for i, j in zip(countries['CCA3'], countries['Name'])} 27 | country_name_dict.update({"": ""}) 28 | country_name_dict.update({"IGO": "Intergovernmental Organization"}) 29 | return country_name_dict 30 | 31 | def resolve_date(event): 32 | """ 33 | Create a new 'date_resolved' key with a date in YYYY-MM-DD format 34 | 35 | TODO: 36 | include granularity details (e.g., month, year)?
37 | >>> DateDataParser().get_date_data('March 2015') 38 | DateData(date_obj=datetime.datetime(2015, 3, 16, 0, 0), period='month', locale='en') 39 | """ 40 | if 'DATE' not in event['attributes'].keys(): 41 | pub_date = dateparser.parse(event['pub_date']).strftime("%Y-%m-%d") 42 | event['date_resolved'] = pub_date 43 | event['date_raw'] = "No date detected--using publication date" 44 | return event 45 | if not event['attributes']['DATE']: 46 | pub_date = dateparser.parse(event['pub_date']).strftime("%Y-%m-%d") 47 | event['date_resolved'] = pub_date 48 | event['date_raw'] = "" 49 | return event 50 | 51 | base_date = dateparser.parse(event['pub_date']) 52 | raw_date = event['attributes']['DATE'][0]['text'] 53 | print(f"raw_date: {raw_date}") 54 | 55 | resolved_date = dateparser.parse(date_string=raw_date, settings={'RELATIVE_BASE': base_date, 56 | 'PREFER_DATES_FROM': "past"}) 57 | if not resolved_date: 58 | if re.search("next|later", raw_date): 59 | raw_date = re.sub(r"next|later", "", raw_date).strip() 60 | resolved_date = dateparser.parse(date_string=raw_date, settings={'RELATIVE_BASE': base_date, 61 | 'PREFER_DATES_FROM': "future"}) 62 | if resolved_date: 63 | event['date_resolved'] = resolved_date.strftime("%Y-%m-%d") 64 | event['date_raw'] = raw_date 65 | return event 66 | if not resolved_date: 67 | event['date_resolved'] = event['pub_date'] 68 | event['date_raw'] = "" 69 | return event 70 | else: 71 | event['date_resolved'] = resolved_date.strftime("%Y-%m-%d") 72 | event['date_raw'] = raw_date 73 | return event 74 | 75 | 76 | class Formatter: 77 | def __init__(self, quiet=False, base_path="assets", geolocation_threshold=0.85): 78 | self.quiet = quiet 79 | self.base_path = base_path 80 | self.iso_to_name = country_name_dict(self.base_path) 81 | self.geo_threshold = geolocation_threshold 82 | 83 | """ 84 | event = { 'attributes': { 'ACTOR': [{ 'qa_end_char': 53, 85 | 'qa_score': 0.31743326783180237, 86 | 'qa_start_char': 39, 87 | 'text': 'Nicolas Maduro', 88 | 'score': 0.23675884306430817, 89 | 'wiki': 'Nicolás Maduro', 90 | 'country': 'VEN', 91 | 'code_1': 'ELI', 92 | 'code_2': ''}], 93 | 'LOC': [{ 'qa_end_char': 156, 94 | 'qa_score': 0.4355418384075165, 95 | 'qa_start_char': 148, 96 | 'text': 'Barbados'}], 97 | 'RECIP': [{ 'qa_end_char': 90, 98 | 'qa_score': 0.1324695497751236, 99 | 'qa_start_char': 79, 100 | 'score': 0.13248120248317719, 101 | 'wiki': 'Juan Guaidó', 102 | 'country': 'VEN', 103 | 'code_1': 'REB', 104 | 'code_2': '', 105 | 'text': 'Juan Guaidó'}]}, 106 | 'contexts': ['pro_democracy'], 107 | 'date': '2019-08-01', 108 | 'event_geolocation': { 'admin1_code': '00', 109 | 'admin1_name': '', 110 | 'admin2_code': '', 111 | 'admin2_name': '', 112 | 'country_code3': 'BRB', 113 | 'end_char': 156, 114 | 'event_location_overlap_score': 1.0, 115 | 'feature_class': 'A', 116 | 'feature_code': 'PCLI', 117 | 'geonameid': '3374084', 118 | 'lat': 13.16453, 119 | 'lon': -59.55165, 120 | 'resolved_placename': 'Barbados', 121 | 'score': 1.0, 122 | 'search_placename': 'Barbados', 123 | 'start_char': 148}, 124 | 'event_mode': [], 125 | 'event_text': 'Delegates of the Venezuelan president, Nicolas Maduro, and ' 126 | 'the leader objector Juan Guaidó resumed on Wednesday (31) ' 127 | 'conversations on the island of Barbados, sponsored by ' 128 | 'Norway, to seek a way out of the crisis in their country, ' 129 | 'announced the parties. 
"We started another round of ' 130 | 'sanctions under the mechanism of Oslo," indicated on ' 131 | 'Twitter Mr Stalin González, one of the envoys of Guaidó, ' 132 | 'parliamentary leader recognized as interim president by ' 133 | 'half hundred countries. The vice-president of Venezuela, ' 134 | 'Delcy Rodríguez, confirmed in a press conference that ' 135 | 'representatives of mature traveled to Barbados for the ' 136 | 'meetings with the opposition. Mature reaffirmed in a ' 137 | 'message to the nation that the government seeks to ' 138 | 'establish a "bureau for permanent dialog with the ' 139 | 'opposition, and called entrepreneurs and social movements ' 140 | 'to be added to the process. After exploratory ' 141 | 'approximations and a first face to face in Oslo in mid-May, ' 142 | 'the parties have transferred the dialog on 8 July for the ' 143 | 'caribbean island. The opposition search in the negotiations ' 144 | 'the output of mature and a new election, by considering ' 145 | 'that his second term, started last January, resulted from ' 146 | 'fraudulent elections, not recognized by almost 60 ' 147 | 'countries, among them the United States. ', 148 | 'event_type': 'RETREAT', 149 | 'geolocated_ents': [ { 'admin1_code': '00', 150 | 'admin1_name': '', 151 | 'admin2_code': '', 152 | 'admin2_name': '', 153 | 'country_code3': 'BRB', 154 | 'end_char': 156, 155 | 'event_location_overlap_score': 1.0, 156 | 'feature_class': 'A', 157 | 'feature_code': 'PCLI', 158 | 'geonameid': '3374084', 159 | 'lat': 13.16453, 160 | 'lon': -59.55165, 161 | 'resolved_placename': 'Barbados', 162 | 'score': 1.0, 163 | 'search_placename': 'Barbados', 164 | 'start_char': 148}, 165 | { 'admin1_code': '00', 166 | 'admin1_name': '', 167 | 'admin2_code': '', 168 | 'admin2_name': '', 169 | 'country_code3': 'NOR', 170 | 'end_char': 177, 171 | 'feature_class': 'A', 172 | 'feature_code': 'PCLI', 173 | 'geonameid': '3144096', 174 | 'lat': 62.0, 175 | 'lon': 10.0, 176 | 'resolved_placename': 'Kingdom of Norway', 177 | 'score': 1.0, 178 | 'search_placename': 'Norway', 179 | 'start_char': 171}, 180 | { 'admin1_code': '12', 181 | 'admin1_name': 'Oslo', 182 | 'admin2_code': '0301', 183 | 'admin2_name': 'Oslo', 184 | 'country_code3': 'NOR', 185 | 'end_char': 318, 186 | 'feature_class': 'P', 187 | 'feature_code': 'PPLC', 188 | 'geonameid': '3143244', 189 | 'lat': 59.91273, 190 | 'lon': 10.74609, 191 | 'resolved_placename': 'Oslo', 192 | 'score': 1.0, 193 | 'search_placename': 'Oslo', 194 | 'start_char': 314}, 195 | { 'admin1_code': '00', 196 | 'admin1_name': '', 197 | 'admin2_code': '', 198 | 'admin2_name': '', 199 | 'country_code3': 'VEN', 200 | 'end_char': 502, 201 | 'feature_class': 'A', 202 | 'feature_code': 'PCLI', 203 | 'geonameid': '3625428', 204 | 'lat': 8.0, 205 | 'lon': -66.0, 206 | 'resolved_placename': 'Bolivarian Republic of ' 207 | 'Venezuela', 208 | 'score': 1.0, 209 | 'search_placename': 'Venezuela', 210 | 'start_char': 493}, 211 | { 'admin1_code': '00', 212 | 'admin1_name': '', 213 | 'admin2_code': '', 214 | 'admin2_name': '', 215 | 'country_code3': 'BRB', 216 | 'end_char': 604, 217 | 'feature_class': 'A', 218 | 'feature_code': 'PCLI', 219 | 'geonameid': '3374084', 220 | 'lat': 13.16453, 221 | 'lon': -59.55165, 222 | 'resolved_placename': 'Barbados', 223 | 'score': 1.0, 224 | 'search_placename': 'Barbados', 225 | 'start_char': 596}, 226 | { 'admin1_code': '12', 227 | 'admin1_name': 'Oslo', 228 | 'admin2_code': '0301', 229 | 'admin2_name': 'Oslo', 230 | 'country_code3': 'NOR', 231 | 'end_char': 918, 232 | 
'feature_class': 'P', 233 | 'feature_code': 'PPLC', 234 | 'geonameid': '3143244', 235 | 'lat': 59.91273, 236 | 'lon': 10.74609, 237 | 'resolved_placename': 'Oslo', 238 | 'score': 1.0, 239 | 'search_placename': 'Oslo', 240 | 'start_char': 914}, 241 | { 'admin1_code': '00', 242 | 'admin1_name': '', 243 | 'admin2_code': '', 244 | 'admin2_name': '', 245 | 'country_code3': 'USA', 246 | 'end_char': 1259, 247 | 'feature_class': 'A', 248 | 'feature_code': 'PCLI', 249 | 'geonameid': '6252001', 250 | 'lat': 39.76, 251 | 'lon': -98.5, 252 | 'resolved_placename': 'United States', 253 | 'score': 1.0, 254 | 'search_placename': 'United States', 255 | 'start_char': 1239}], 256 | 'headline': 'Governo e oposição da Venezuela retomam diálogo em Barbados\n', 257 | 'id': '20190801-2309-4e081644904c_COOPERATE_R', 258 | 'pub_date': '2019-08-01', 259 | 'publisher': 'translateme2-pt', 260 | 'story_id': 'AFPPT00020190801ef81000jh:50066619', 261 | 'story_people': ['Delcy Rodríguez', 'Guaidó', 'Nicolas Maduro', 'Stalin González', 'Juan Guaidó'], 262 | 'story_orgs': ['Mature'], 263 | 'story_locs': ['Norway', 'United States', 'Barbados', 'Oslo', 'Venezuela'], 264 | 'version': 'NGEC_coder-Vers001-b1-Run-001'} 265 | """ 266 | 267 | def find_event_loc(self, event, geo_overlap_thresh=0.5): 268 | if 'LOC' not in event['attributes'].keys(): 269 | event['event_geolocation'] = {"reason": "No LOC attribute found by the QA/attribute model", 270 | "geo": None} 271 | return event 272 | try: 273 | event_loc_raw = event['attributes']['LOC'][0] ## NOTE!! Assuming just one location from the QA model 274 | except IndexError: 275 | event['event_geolocation'] = {"reason": "No LOC attribute found by the QA/attribute model", 276 | "geo": None} 277 | return event 278 | if not event_loc_raw: 279 | event['event_geolocation'] = {"reason": "No LOC attribute found by the QA/attribute model", 280 | "geo": None} 281 | return event 282 | if 'geolocated_ents' not in event.keys(): 283 | event['event_geolocation'] = {"reason": "No story locations were geolocated (Missing 'geolocated_ents' key).", 284 | "geo": None} 285 | return event 286 | event_loc_chars = set(range(event_loc_raw['qa_start_char'], event_loc_raw['qa_end_char'])) 287 | geo_ent_ranges = [set(range(i['start_char'], i['end_char'])) for i in event['geolocated_ents']] 288 | # calculate intersection-over-union/Jaccard 289 | ious = np.array([len(event_loc_chars.intersection(i)) / len(event_loc_chars.union(i)) for i in geo_ent_ranges]) 290 | if len(ious) == 0: 291 | event['event_geolocation'] = {"reason": "No geolocated entities", 292 | "geo": None} 293 | return event 294 | try: 295 | if np.max(ious) < geo_overlap_thresh: 296 | event['event_geolocation'] = {"reason": f"Attribute placename ({event_loc_raw['text']}) doesn't overlap enough with any placenames: {str(np.max(ious))}", 297 | "geo": None} 298 | return event 299 | except ValueError: 300 | event['event_geolocation'] = {"reason": "Problem with intersection-overlap vector. No elements?", 301 | "geo": None} 302 | return event 303 | best_match = event['geolocated_ents'][np.argmax(ious)] 304 | if not best_match: 305 | event['event_geolocation'] = {"reason": "No 'best_match' geolocated entity", 306 | "geo": None} 307 | return event 308 | best_match['event_location_overlap_score'] = float(np.max(ious)) 309 | if 'score' not in best_match.keys(): 310 | event['event_geolocation'] = {"reason": "'best_match' identified but no 'score' key.
Returning best_match anyway", 311 | "geo": best_match} 312 | return event 313 | if best_match['score'] > self.geo_threshold: 314 | event['event_geolocation'] = {"reason": f": Successful overlap between attribute placename and one of the geoparser results", 315 | "geo": best_match} 316 | return event 317 | else: 318 | event['event_geolocation'] = {"reason": f": Successful overlap between attribute placename and one of the geoparser results BUT geoparser score was too low ({best_match['score']})", 319 | "geo": None} 320 | return event 321 | 322 | 323 | 324 | 325 | def add_meta(self, event): 326 | """ 327 | Add optional metadata to the event dictionary (e.g. alternative country codes, country names, 328 | event intensity, event quad class, etc.) 329 | """ 330 | for k, att in event['attributes'].items(): 331 | # add stuff to actors and recipients 332 | if k in ["LOC", "DATE"]: 333 | continue 334 | for v in att: 335 | try: 336 | v['country_name'] = self.iso_to_name[v['country']] 337 | except: 338 | print(v['country']) 339 | v['country_name'] = "" 340 | 341 | return event 342 | 343 | 344 | def process(self, event_list, return_raw=False): 345 | """ 346 | Create and write out a final cleaned dictionary/JSON file of events. 347 | 348 | Parameters 349 | ---------- 350 | event_list: list of dicts 351 | list of events after being passed through each of the processing steps 352 | return_raw: bool 353 | If true, don't write to a final and instead return the final version. Useful for 354 | debugging. Defaults to False. 355 | """ 356 | for n, event in enumerate(event_list): 357 | #if n == 0: 358 | # print(e) 359 | event = self.find_event_loc(event) 360 | event = self.add_meta(event) 361 | try: 362 | event = resolve_date(event) 363 | except Exception as exception: 364 | logger.warning(f"{exception} parsing date for event number {n}") 365 | if return_raw: 366 | return event_list 367 | else: 368 | with jsonlines.open("events_processed.jsonl", "w") as f: 369 | f.write_all(event_list) 370 | 371 | -------------------------------------------------------------------------------- /NGEC/geolocation.py: -------------------------------------------------------------------------------- 1 | from mordecai3 import Geoparser 2 | from rich.progress import track 3 | from rich import print 4 | import time 5 | import jsonlines 6 | import pandas as pd 7 | import os 8 | 9 | import logging 10 | logger = logging.getLogger(__name__) 11 | logger.addHandler(logging.NullHandler()) 12 | 13 | def country_name_dict(base_path): 14 | file = os.path.join(base_path, "countries.csv") 15 | countries = pd.read_csv(file) 16 | country_name_dict = {i:j for i, j in zip(countries['CCA3'], countries['Name'])} 17 | country_name_dict.update({"": ""}) 18 | country_name_dict.update({"IGO": "Intergovernmental Organization"}) 19 | return country_name_dict 20 | 21 | 22 | class GeolocationModel: 23 | def __init__(self, 24 | geo_model="/Users/ahalterman/MIT/Geolocation/mordecai3_scratch/mordecai3/mordecai_new.pt", 25 | nlp=None, 26 | base_path = "NGEC/assets/", 27 | geo_path = "../mordecai3/mordecai3/assets/", 28 | save_intermediate=False, 29 | quiet=False): 30 | self.geo = Geoparser(geo_model, 31 | geo_asset_path=geo_path, 32 | nlp=nlp, 33 | event_geoparse=False, 34 | trim=True, 35 | debug=False) 36 | self.quiet = quiet 37 | self.save_intermediate = save_intermediate 38 | self.iso_to_name = country_name_dict(base_path) 39 | 40 | 41 | def process(self, story_list, doc_list): 42 | """ 43 | Wrap the Mordecai3 geoparser function. 
44 | 45 | Parameters 46 | -------- 47 | story_list: list of story dicts. See example 48 | doc_list: list of spaCy docs 49 | 50 | Example 51 | ------ 52 | event = {'id': '20190801-2227-8b13212ac6f6', 53 | 'date': '2019-08-01', 54 | 'event_type': ['SANCTION', 'PROTEST'], 55 | 'event_mode': [], 56 | 'event_text': 'The Liberal Party, the largest opposition in Paraguay, announced in the evening of Wednesday the decision to submit an application of impeachment against the president of the country, Mario Abdo Benítez, and vice-president Hugo Velázquez, by polemical agreement with Brazil on the purchase of energy produced in Itaipu. According to the president of the Liberal Party, Efraín Alegre, the opposition also come tomorrow with penal action against all those involved in the negotiations of the agreement with Brazil, signed on confidentiality in May and criticized for being detrimental to the interests of the country. The Liberal Party has the support of the front Guasú, Senator and former President Fernando Lugo, he himself target of an impeachment, decided in less than 24 hours, in June 2012. According to legend, the reasons for the opening of the proceedings against Abdo Benítez are bad performance of functions, betrayal of the homeland and trafficking of influence. Alegre also announced the convocation of demonstrations throughout the country on Friday. ', 57 | 'story_id': 'EFESP00020190801ef8100001:50066618', 58 | 'publisher': 'translateme2-pt', 59 | 'headline': '\nOposição confirma que pedirá impeachment de presidente do Paraguai; PARAGUAI GOVERNO (Pauta)\n', 60 | 'pub_date': '2019-08-01', 'contexts': ['corruption'], 61 | 'version': 'NGEC_coder-Vers001-b1-Run-001', 62 | 'attributes': {'ACTOR': {'text': 'Mario Abdo Benítez', 'score': 0.1976235955953598}, 63 | 'RECIP': {'text': 'Fernando Lugo', 'score': 0.10433810204267502}, 64 | 'LOC': {'text': 'Paraguay', 'score': 0.24138706922531128}}} 65 | gp.process([event]) 66 | """ 67 | if len(doc_list) != len(story_list): 68 | raise ValueError(f"story_list length does not match spaCy doc list len: {len(story_list)} vs. 
{len(doc_list)}.") 69 | 70 | for n, story in track(enumerate(story_list), total=len(story_list), description="Geoparsing stories..."): 71 | doc = doc_list[n] 72 | res = self.geo.geoparse_doc(doc) 73 | for r in res['geolocated_ents']: 74 | try: 75 | r['country_name'] = self.iso_to_name[r['country_code3']] 76 | except KeyError: 77 | #logger.warning(f"Missing country code for {r}") 78 | r['country_name'] = None 79 | #if 'placename' not in r.keys(): 80 | # print("'placename' key missing from geolocation results") 81 | # #print(r) 82 | # continue 83 | #r['search_placename'] = r['placename'] 84 | #if 'resolved_placename' not in r.keys() and 'name' in r.keys(): 85 | # r['resolved_placename'] = r['name'] 86 | # del r['name'] 87 | if 'name' in r.keys(): 88 | r['resolved_placename'] = r['name'] 89 | del r['name'] 90 | story['geolocated_ents'] = res['geolocated_ents'] 91 | 92 | 93 | if self.save_intermediate: 94 | fn = time.strftime("%Y_%m_%d-%H") + "_geolocation_output.jsonl" 95 | with jsonlines.open(fn, "w") as f: 96 | f.write_all(story_list) 97 | 98 | return story_list 99 | 100 | 101 | 102 | if __name__ == "__main__": 103 | #import streamlit as st 104 | 105 | #@st.cache(allow_output_mutation=True, suppress_st_warning=True) 106 | #def make_ag(): 107 | # ag = ActorResolver() 108 | # return ag 109 | 110 | #ag = make_ag() 111 | 112 | #query_text = st.text_input("Enter an actor string") 113 | #query_date = st.text_input("Enter a date", "today") 114 | 115 | #best = ag.agent_to_code(query_text, query_date) 116 | #st.write(best) 117 | import jsonlines 118 | 119 | ag = ActorResolver() 120 | with jsonlines.open("PLOVER_coding_201908_with_attr.jsonl", "r") as f: 121 | data = list(f.iter()) 122 | 123 | out = ag.process(data) 124 | with jsonlines.open("PLOVER_coding_201908_with_actor.jsonl", "w") as f: 125 | f.write_all(out) 126 | -------------------------------------------------------------------------------- /NGEC/mode_class.py: -------------------------------------------------------------------------------- 1 | import sklearn 2 | 3 | def _load_model(model_dir): 4 | """ 5 | Load the mode classification models. 6 | 7 | Parameters 8 | ---------- 9 | model_dir: Path 10 | path to the mode classification models 11 | 12 | Returns 13 | ------ 14 | model_dict: dict 15 | With event classes as keys and models as values. 16 | """ 17 | raise NotImplementedError() 18 | 19 | 20 | class ModeClass: 21 | def __init__(self, 22 | model_dir="assets/mode_class_models/", 23 | threshold=0.6 # we can set stuff like this here 24 | ): 25 | 26 | self.model_dict = _load_model(model_dir) 27 | self.threshold = threshold 28 | 29 | 30 | def process(self, story_list): 31 | """ 32 | Process a list of stories to detect the event class. 33 | 34 | Example 35 | ------- 36 | The input is a list of dictionaries, each with an 'event_text' key with the full text of the story 37 | and a 'event_type' key with a list of detected event types, e.g. ['SANCTION', 'MOBILIZE'] 38 | 39 | {'date': '2019-08-01', 40 | 'event_type': ['SANCTION', 'MOBILIZE'], 41 | 'event_text': 'Indonesia is investigating a report that ... ', 42 | 'headline': 'Indonesia says it is probing a report of a ...', 43 | 'id': '', 44 | 'pub_date': '2019-08-01', 45 | 'publisher': '', 46 | 'story_id': '', 47 | 'version': ''} 48 | 49 | Parameters 50 | ---------- 51 | story_list: list of dicts 52 | Each dictionary must have an 'event_text' key with the full text of the story and 53 | an 'event_type' key with a list of detected event types. 
54 | 55 | Returns 56 | ------- 57 | story_list: list of dicts 58 | Each story dictionary now contains an "event_mode" key with a list of detected modes (str). E.g.: 59 | 'event_mode': ['SANCTION-withdraw'] 60 | 61 | 62 | """ 63 | # validate the input before the (not yet implemented) classification step 64 | for story in story_list: 65 | if 'event_text' not in story: 66 | raise ValueError("No 'event_text' key in input.") 67 | if 'event_type' not in story: 68 | raise ValueError("Must have detected event types in input.") 69 | raise NotImplementedError() -------------------------------------------------------------------------------- /NGEC/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/tests/__init__.py -------------------------------------------------------------------------------- /NGEC/tests/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/tests/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /NGEC/tests/__pycache__/conftest.cpython-39-pytest-7.0.1.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/tests/__pycache__/conftest.cpython-39-pytest-7.0.1.pyc -------------------------------------------------------------------------------- /NGEC/tests/__pycache__/test_actor_resolution.cpython-39-pytest-7.0.1.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/tests/__pycache__/test_actor_resolution.cpython-39-pytest-7.0.1.pyc -------------------------------------------------------------------------------- /NGEC/tests/conftest.py: -------------------------------------------------------------------------------- 1 | from ..actor_resolution import ActorResolver 2 | from ..formatter import Formatter 3 | import pytest 4 | import spacy 5 | from NGEC import AttributeModel 6 | 7 | @pytest.fixture(scope='session', autouse=True) 8 | def ag(): 9 | return ActorResolver(base_path="./assets/") 10 | 11 | @pytest.fixture(scope='session', autouse=True) 12 | def nlp(): 13 | return spacy.load("en_core_web_trf") 14 | 15 | @pytest.fixture(scope='session', autouse=True) 16 | def am(): 17 | return AttributeModel(model_dir = "./assets/PROP-SQuAD-trained-tinybert-6l-768d-squad2220302-1457", 18 | expand_actors=True, 19 | silent=False) 20 | 21 | -------------------------------------------------------------------------------- /NGEC/tests/test_attribute_model.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import datetime 3 | 4 | def test_nat1(am): 5 | pass -------------------------------------------------------------------------------- /NGEC/tests/test_formatter.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from ..formatter import resolve_date 3 | 4 | def test_resolution(): 5 | event = {"pub_date": "June 20, 2012", 6 | "attributes": {"DATE": [{"text": "last Sunday"}]}} 7 | resolve_date(event) -------------------------------------------------------------------------------- /NGEC/tests/test_multiple_actors.py:
-------------------------------------------------------------------------------- 1 | 2 | import re 3 | 4 | def make_example(text, actor_phrase, nlp): 5 | doc = nlp(text) 6 | match = re.search(actor_phrase, text) 7 | qa = {'text': actor_phrase, 8 | 'qa_score': 0.4408265948295593, 9 | 'qa_start_char': match.span()[0], 10 | 'qa_end_char': match.span()[1]} 11 | return doc, qa 12 | 13 | def test_1(am, nlp): 14 | # simple split, both actors present in answer span 15 | text = "Ukrainian forces carried out airstrikes against Russians and Belorussians" 16 | actor_phrase = "Russians and Belorussians" 17 | doc, qa = make_example(text, actor_phrase, nlp) 18 | actors = am.find_co_actors(qa, doc) 19 | assert set([i['text'] for i in actors]) == set(["Russians", "Belorussians"]) 20 | 21 | def test_2(am, nlp): 22 | # amod split, both actors present in answer span 23 | text = "Ukrainian forces carried out airstrikes against Russian and Belorussian soldiers" 24 | actor_phrase = "Russian and Belorussian soldiers" 25 | doc, qa = make_example(text, actor_phrase, nlp) 26 | actors = am.find_co_actors(qa, doc) 27 | assert set([i['text'] for i in actors]) == set(["Belorussian soldiers", "Russian soldiers"]) 28 | 29 | def test_3(am, nlp): 30 | # simple split, only one actor present in answer span 31 | text = "Ukrainian forces carried out airstrikes against Russians and Belorussians" 32 | actor_phrase = "Russians" 33 | doc, qa = make_example(text, actor_phrase, nlp) 34 | actors = am.find_co_actors(qa, doc) 35 | assert set([i['text'] for i in actors]) == set(["Russians", "Belorussians"]) 36 | 37 | def test_3b(am, nlp): # renamed from a duplicate "test_3", which silently shadowed the test above 38 | # simple split, only the second actor present in answer span 39 | text = "Ukrainian forces carried out airstrikes against Russians and Belorussians" 40 | actor_phrase = "Belorussians" 41 | doc, qa = make_example(text, actor_phrase, nlp) 42 | actors = am.find_co_actors(qa, doc) 43 | assert set([i['text'] for i in actors]) == set(["Belorussians", "Russians"]) 44 | 45 | def test_4(am, nlp): 46 | # amod split, only the second actor present in answer span 47 | text = "Ukrainian forces carried out airstrikes against Russian and Belorussian soldiers" 48 | actor_phrase = "Belorussian soldiers" 49 | doc, qa = make_example(text, actor_phrase, nlp) 50 | actors = am.find_co_actors(qa, doc) 51 | assert set([i['text'] for i in actors]) == set(["Russian soldiers", "Belorussian soldiers"]) 52 | 53 | def test_5(am, nlp): 54 | # amod, no second actor present in answer span 55 | text = "Ukrainian forces carried out airstrikes against Russian soldiers" 56 | actor_phrase = "Russian soldiers" 57 | doc, qa = make_example(text, actor_phrase, nlp) 58 | actors = am.find_co_actors(qa, doc) 59 | assert set([i['text'] for i in actors]) == set(["Russian soldiers"]) 60 | 61 | def test_6(am, nlp): 62 | # long list 63 | text = "Japan, the United States, Australia and India got together in New York in September last year for the first time." 64 | actor_phrase = "Japan" 65 | doc, qa = make_example(text, actor_phrase, nlp) 66 | actors = am.find_co_actors(qa, doc) 67 | assert set([i['text'] for i in actors]) == set(['Japan', 'United States', 'Australia', 'India']) 68 | 69 | def test_7(am, nlp): 70 | # two actors, full titles 71 | text = "Russian President Vladimir Putin and British Prime Minister Boris Johnson will meet in Geneva next week."
72 | actor_phrase = "Vladimir Putin" 73 | doc, qa = make_example(text, actor_phrase, nlp) 74 | actors = am.find_co_actors(qa, doc) 75 | assert set([i['text'] for i in actors]) == set(['Russian President Vladimir Putin', 'British Prime Minister Boris Johnson']) 76 | 77 | def test_8(am, nlp): 78 | text = "U.S. national security adviser Robert O'Brien said Friday he will hold talks with his counterparts from Japan, Australia, and India in Hawaii in October." 79 | actor_phrase = "Japan" 80 | doc, qa = make_example(text, actor_phrase, nlp) 81 | actors = am.find_co_actors(qa, doc) 82 | assert set([i['text'] for i in actors]) == set(['Australia', 'Japan', 'India']) 83 | 84 | def test_9(am, nlp): 85 | # Checks that we aren't picking up appostive clauses that aren't compound lists 86 | text = "Og Fernandes, rapporteur of Operation Faroeste, revoked the house arrest of Sandra Inês Rusciolelli, the first judge to sign a plea bargaining agreement in Brazil." 87 | actor_phrase = "Sandra Inês Rusciolelli" 88 | doc, qa = make_example(text, actor_phrase, nlp) 89 | actors = am.find_co_actors(qa, doc) 90 | assert set([i['text'] for i in actors]) == set(["Sandra Inês Rusciolelli"]) 91 | 92 | 93 | def test_10(am, nlp): 94 | text = "According to a statement published on its website, Putin and Johnson discussed climate issues in light of of the forthcoming UN climate change conference COP26 and leaders ' summit in Glasgow." 95 | actor_phrase = "Putin" 96 | doc, qa = make_example(text, actor_phrase, nlp) 97 | actors = am.find_co_actors(qa, doc) 98 | assert set([i['text'] for i in actors]) == set(['Putin', 'Johnson']) 99 | 100 | def test_11(am, nlp): 101 | # Actors follow an introductory clause 102 | text = "According to a statement published on its website, Putin and Johnson discussed climate issues in light of of the forthcoming UN climate change conference COP26 and leaders ' summit in Glasgow." 103 | actor_phrase = "Putin and Johnson" 104 | doc, qa = make_example(text, actor_phrase, nlp) 105 | actors = am.find_co_actors(qa, doc) 106 | assert set([i['text'] for i in actors]) == set(['Putin', 'Johnson']) 107 | 108 | def test_12(am, nlp): 109 | text = "Qasr al-Nil Misdemeanor Court earlier cleared 28 arrested suspects and another 24 fugitives over accusations of protesting without prior permission." 110 | actor_phrase = "arrested suspects" 111 | doc, qa = make_example(text, actor_phrase, nlp) 112 | actors = am.find_co_actors(qa, doc) 113 | 114 | def test_13(am, nlp): 115 | text = "Last month Russia and Turkish Foreign Minister Mevlut Cavusoglu both accused Iran of trying to destabilise Syria and Iraq and of sectarianism, prompting Tehran to summon Ankara's ambassador." 116 | actor_phrase = "Russia" 117 | doc, qa = make_example(text, actor_phrase, nlp) 118 | actors = am.find_co_actors(qa, doc) 119 | assert set([i['text'] for i in actors]) == set(['Russia', 'Turkish Foreign Minister Mevlut Cavusoglu']) 120 | 121 | def test_14(am, nlp): 122 | text = "\"Dine Ak Diamono\" talk show hosted by Moustapha Diop, with Bassirou Ngom, lawyer and member of the Alliance for the Republic; Barrister Babacar Ba, leader of the civil society organization known as \"Forum du Justiciable;\" Alassane Kitane, teacher of Philosophy at Amary Ndack Seck High School in Thies; and Oumar Faye of the movement Leeral Askanwi [Enlightening People], as guests - live from studio [Diop] Good evening viewers and thank you for your fidelity to the \"Dine Ak Diamono\" talk show." 
123 | 124 | def test_15(am, nlp): 125 | text = "The Liberal Party, the largest opposition in Paraguay, announced in the evening of Wednesday the decision to submit an application of impeachment against the president of the country, Mario Abdo Benítez, and vice-president Hugo Velázquez, by polemical agreement with Brazil on the purchase of energy produced in Itaipu." 126 | actor_phrase = "Velázquez" 127 | doc, qa = make_example(text, actor_phrase, nlp) 128 | actors = am.find_co_actors(qa, doc) 129 | 130 | def test_16(am, nlp): 131 | text = "The Liberal Party, the largest opposition in Paraguay, announced in the evening of Wednesday the decision to submit an application of impeachment against the president of the country, Mario Abdo Benítez, and vice Hugo Velázquez, by polemical agreement with Brazil on the purchase of energy produced in Itaipu." 132 | actor_phrase = "president" 133 | doc, qa = make_example(text, actor_phrase, nlp) 134 | am.expand_actor(qa, doc) 135 | 136 | def test_17(am, nlp): 137 | # Actors follow an introductory clause 138 | text = "The leaders of Germany, France, and the UK met in light of the forthcoming UN climate change conference COP26 and leaders ' summit in Glasgow." 139 | actor_phrase = "UK" 140 | doc, qa = make_example(text, actor_phrase, nlp) 141 | actors = am.find_co_actors(qa, doc) 142 | assert set([i['text'] for i in actors]) == set(['Germany', 'France', 'UK']) 143 | -------------------------------------------------------------------------------- /NGEC/utilities.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from spacy.tokens import Token 3 | from spacy.language import Language 4 | import numpy as np 5 | import re 6 | 7 | import logging 8 | logger = logging.getLogger(__name__) 9 | logger.addHandler(logging.NullHandler()) 10 | 11 | 12 | def spacy_doc_setup(): 13 | try: 14 | Token.set_extension('tensor', default=False) 15 | except ValueError: 16 | pass 17 | try: 18 | @Language.component("token_tensors") 19 | def token_tensors(doc): 20 | tensors = doc._.trf_data.last_hidden_layer_state 21 | for n, d in enumerate(doc): 22 | if tensors[n]: 23 | d._.set('tensor', tensors[n]) 24 | else: 25 | d._.set('tensor', np.zeros(tensors[0].shape[-1])) 26 | return doc 27 | except ValueError: 28 | pass 29 | 30 | ### TESTING ### 31 | ### Comment this out and run to verify that the new 3.7+ version of spaCy works 32 | #import spacy 33 | #nlp = spacy.load("en_core_web_trf") 34 | #spacy_doc_setup() 35 | #nlp.add_pipe("token_tensors") 36 | # 37 | #doc = nlp("We visited Berlin and Alexanderplatz.") 38 | #doc[3]._.tensor 39 | #### 40 | 41 | def stories_to_events(story_list, doc_list=None): 42 | if not doc_list: 43 | logger.warning("Missing doc list...") 44 | if doc_list: 45 | if len(doc_list) != len(story_list): 46 | raise ValueError("the story list and list of spaCy docs must be the same length") 47 | for n, story in enumerate(story_list): 48 | doc = doc_list[n] 49 | story['story_people'] = list(set([i.text for i in doc.ents if i.label_ == "PERSON"])) 50 | story['story_organizations'] = list(set([i.text for i in doc.ents if i.label_ == "ORG"])) 51 | story['story_places'] = list(set([i.text for i in doc.ents if i.label_ in ["GPE", "LOC", "FAC"]])) 52 | story['_doc_position'] = n 53 | # "lengthen" the story-level data to generate a separate element 54 | # for each event type 55 | event_list = [] 56 | for n, ex in enumerate(story_list): 57 | # event modes are formatted ["ACCUSE-disapprove", "ACCUSE-allege", 
"CONSULT-third-party"] 58 | modes = [i.split("-") for i in ex['event_mode']] 59 | events_with_modes = list(set([i[0] if i else None for i in modes])) 60 | for event_type in ex['event_type']: 61 | if event_type not in events_with_modes: 62 | event_mode = "" 63 | d = ex.copy() # note: the copy is important! 64 | d['event_type'] = event_type 65 | d['orig_id'] = d['id'] 66 | d['event_mode'] = event_mode 67 | d['id'] = d['id'] + "_" + event_type + "_" # generate a new ID 68 | event_list.append(d) 69 | else: 70 | for et, *event_mode in modes: 71 | # annoyingly, the event and mode are separated by a hyphen, but 72 | # there are also hyphens within certain mode names. Merge those back 73 | # together 74 | event_mode = '-'.join([*event_mode]) 75 | if et != event_type: 76 | # skip modes that are attached to the wrong event type 77 | continue 78 | d = ex.copy() # note: the copy is important! 79 | d['event_type'] = event_type 80 | d['orig_id'] = d['id'] 81 | d['event_mode'] = event_mode 82 | d['id'] = d['id'] + "_" + event_type + "_" + event_mode # generate a new ID 83 | event_list.append(d) 84 | return event_list -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NGEC -- Next generation political event coder 2 | 3 | This repository contains the code for the Next Generation Event Coder (NGEC), a 4 | Python library for extracting event data from news text. The pipeline works out-of-the-box 5 | to code events using the [PLOVER event ontology](https://osf.io/preprints/socarxiv/rm5dw/), but can 6 | be easily customized to produce events with a custom ontology. 7 | 8 | It accompanies the working paper, ["Creating Custom Event Data Without Dictionaries: A Bag-of-Tricks"](https://arxiv.org/pdf/2304.01331.pdf). 9 | 10 | ## Overview 11 | 12 | We break the problem of event extraction into six steps: 13 | 14 | 1. Event classification: identify the event described in a document (e.g., PROTEST, ASSAULT, AGREE,...) using a transformer classifier trained on new data. 15 | 2. Sub-event (``mode'') classification: identify a more specific event type (e.g., PROTEST-riot, ASSAULT-aerial), also using a transformer-based classifier. 16 | 3. Context classification: identify themes or topics in a document (e.g., "human rights", "environment") using a classifier. 17 | 4. Event attribute identification: identifying the spans of text that report who carried out the event, who it was directed against, where it occurred, etc. We do this with a fine-tuned question-answering model trained on newly annotated text. 18 | 5. Actor, location, and date resolution: we resolve extracted named actors and recipients to their Wikipedia page using an offline Wikipedia index and a custom neural similarity model. 19 | 6. Entity categorization: Finally, we map the actor to their country and their "sector" code as defined by the PLOVER ontology (e.g., "GOV", "MIL", etc.) 20 | 21 | ![](docs/pipeline_figure.png) 22 | 23 | Currently, this processing pipeline only performs the following steps: 24 | 25 | *Note*: This repo has basic pretrained models for event detection, but does *not* 26 | currently include context and mode models. 27 | 28 | ## Running 29 | 30 | The main script is `ngec_process.py`. 
31 | 32 | ``` 33 | python ngec_process.py 34 | 35 | usage: ngec_process.py [-h] [-m -1] [-a NGEC/assets/PROP-SQuAD-trained-tinybert-6l-768d-squad2220302-1457] [-b NGEC/assets/] 36 | [-g ../mordecai3/mordecai_new.pt] 37 | [input_file] 38 | 39 | positional arguments: 40 | input_file JSONL input file. At a minimum, this should have keys for "id", "date", and 41 | "event_text" 42 | 43 | options: 44 | -h, --help show this help message and exit 45 | -m, --max-stories -1 46 | Max stories to code. -1 is all stories 47 | -a, --attribute-dir NGEC/assets/PROP-SQuAD-trained-tinybert-6l-768d-squad2220302-1457 48 | Location of the QA attribute model 49 | -b, --base-path NGEC/assets/ 50 | Location of the other models and files 51 | -g, --geo-model ../mordecai3/mordecai_new.pt 52 | Location of the geolocation model 53 | ``` 54 | 55 | 56 |
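A typical invocation, assuming your stories live in a JSONL file called `events.jsonl` (the file name here is just an illustration), might look like:

```
python ngec_process.py -m 100 events.jsonl
```

This codes the first 100 stories; drop `-m` (or pass `-m -1`) to code everything.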
57 | <details><summary>Click to view example input</summary> 58 | 59 | ``` 60 | { 61 | "id": "20190801-2227-8b13212ac6f6", 62 | "date": "2019-08-01", 63 | "event_type": [ 64 | "ACCUSE", 65 | "REJECT", 66 | "THREATEN", 67 | "SANCTION" 68 | ], 69 | "event_mode": [], 70 | "event_text": "The Liberal Party, the largest opposition in Paraguay, .... ", 71 | "story_id": "EFESP00020190801ef8100001:50066618", 72 | "publisher": "translateme2-pt", 73 | "headline": "\nOposição confirma q...", 74 | "pub_date": "2019-08-01", 75 | "contexts": [ 76 | "corruption" 77 | ], 78 | "version": "NGEC_coder-Vers001-b1-Run-001" 79 | } 80 | ``` 81 | </details>
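If you are assembling an input file yourself, a minimal sketch using the `jsonlines` library (which the pipeline already depends on) might look like the following. The field values are placeholders; at a minimum you need "id", "date", and "event_text":

```python
import jsonlines

# A placeholder story -- swap in your own IDs, dates, and text
stories = [
    {
        "id": "example-0001",
        "date": "2019-08-01",
        "pub_date": "2019-08-01",
        "event_text": "Full text of the news story goes here.",
        "headline": "Example headline",
        "publisher": "",
        "story_id": "example-0001",
        "version": "NGEC_coder-Vers001-b1-Run-001",
    }
]

# Write one JSON object per line, as ngec_process.py expects
with jsonlines.open("events.jsonl", "w") as f:
    f.write_all(stories)
```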
82 | 83 | 84 | ## Quick start 85 | 86 | First, create a new Conda environment and install the required libraries: 87 | 88 | ``` 89 | conda create -y --name ngec python=3.10 90 | conda activate ngec 91 | 92 | pip install spacy textacy sentence-transformers 93 | python -m spacy download en_core_web_trf 94 | pip install elasticsearch elasticsearch_dsl unidecode dateparser 95 | pip install jsonlines tqdm datasets rich plac 96 | pip install mordecai3 97 | ``` 98 | 99 | Next, set up an Elasticsearch server with offline Wikipedia and Geonames indexes. 100 | Download the pre-built index and start an Elasticsearch instance with it 101 | (the code below assumes you have Docker installed): 102 | 103 | ``` 104 | # Download a pre-built index from my website: 105 | wget https://andrewhalterman.com/files/geonames_wiki_index_2023-03-02.tar.gz 106 | # uncompress it to produce a directory called `geonames_index` (note that this includes both geonames *and* Wiki) 107 | tar -xvzf geonames_wiki_index_2023-03-02.tar.gz 108 | # You may need to set write permissions for Docker to run 109 | # chmod -R 777 ./geonames_index/ 110 | # Then start an Elasticsearch instance in Docker with the uncompressed index as a volume. 111 | # Later versions of Elasticsearch have not been tested. 112 | sudo docker run -d -p 127.0.0.1:9200:9200 -e "discovery.type=single-node" -v ./geonames_index/:/usr/share/elasticsearch/data elasticsearch:7.10.1 113 | ``` 114 | 115 | If you want to build these indices from scratch, see the detailed instructions for [creating an offline Wikipedia index](https://github.com/ahalterman/NGEC/tree/main/setup/wiki) and [setting up offline Geonames in Elasticsearch](https://github.com/openeventdata/es-geonames). 116 | 117 | ## Note on the models 118 | 119 | Because of conditions imposed by our funder and the proprietary data used in the project, we cannot share the training data or the trained event, mode, and context models used to produce the POLECAT event dataset. However, we provide example code for training classifiers on your own data in the [setup](https://github.com/ahalterman/NGEC/tree/main/setup/train_classifiers) directory. We also provide demonstration pretrained models for the event categories used in the POLECAT dataset that draw on a corpus of pseudo-labeled synthetic news stories using an approach described in [Halterman (2023)](https://arxiv.org/abs/2303.16028). These classifiers are not as accurate as the ones used in the POLECAT dataset, but work pretty well and could easily be improved with additional training data. 120 | 121 | ## Citing 122 | 123 | The steps that this pipeline implements are described in more detail in the [paper](https://arxiv.org/pdf/2304.01331.pdf). If you use the pipeline or the techniques we introduce, please cite the following: 124 | 125 | ``` 126 | @article{halterman_et_al2023creating, 127 | title={Creating Custom Event Data Without Dictionaries: A Bag-of-Tricks}, 128 | author={Andrew Halterman and Philip A. Schrodt and Andreas Beger and Benjamin E. Bagozzi and Grace I. Scarborough}, 129 | journal={arXiv preprint arXiv:2304.01331}, 130 | year={2023} 131 | } 132 | ``` 133 | 134 | ## Acknowledgements 135 | 136 | This research was sponsored by the Political Instability Task Force (PITF). The PITF is funded by 137 | the Central Intelligence Agency. The views expressed in this paper are the authors’ alone and do not 138 | represent the views of the U.S. Government.
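Before running the pipeline, it is worth confirming that the Elasticsearch container from the Quick start is actually reachable. A minimal sketch using the `elasticsearch` Python client installed above; the host and port are assumptions based on the Docker command given earlier:

```python
from elasticsearch import Elasticsearch

# Assumes the Docker container from the Quick start is running locally
es = Elasticsearch(hosts=["http://localhost:9200"])
print(es.ping())           # True if the server is reachable
print(es.cat.indices())    # should list the pre-built geonames/wiki indices
```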
-------------------------------------------------------------------------------- /examples/Guardian_SDF_sample.csv.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/examples/Guardian_SDF_sample.csv.zip -------------------------------------------------------------------------------- /examples/NGEC_pres.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/examples/NGEC_pres.pdf -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | This directory contains some examples for working with the component parts of the NGEC pipeline. 4 | 5 | Currently, it includes: 6 | 7 | - `demo_wiki_resolution.py`: working with Wikipedia lookups and actor coding 8 | - `demo_mordecai.py`: a simple demo showing the functionality of the geoparser 9 | 10 | For information on training your own event, mode, and context models, see the `setup` directory. -------------------------------------------------------------------------------- /examples/demo_mordecai.py: -------------------------------------------------------------------------------- 1 | # Make sure you've installed Mordecai, which is in a separate package. 2 | # E.g, pip install mordecai3 3 | from mordecai3 import Geoparser 4 | from pprint import pprint 5 | 6 | # Create the Geoparser object 7 | # Make sure the path to the model is correct 8 | geo = Geoparser("NGEC/assets/mordecai_2023-03-28.pt") 9 | 10 | output = geo.geoparse_doc("The Mexican government sent 300 National Guard troopers to bolster the southern state of Guerrero on Tuesday, where a local police chief and 12 officers were shot dead in a brutal ambush the day before.") 11 | 12 | pprint(output) 13 | #{'doc_text': 'The Mexican government sent 300 National Guard troopers to ' 14 | # 'bolster the southern state of Guerrero on Tuesday, where a local ' 15 | # 'police chief and 12 officers were shot dead in a brutal ambush ' 16 | # 'the day before.', 17 | # 'event_location_raw': '', 18 | # 'geolocated_ents': [{'admin1_code': '12', 19 | # 'admin1_name': 'Guerrero', 20 | # 'admin2_code': '', 21 | # 'admin2_name': '', 22 | # 'city_id': '', 23 | # 'city_name': '', 24 | # 'country_code3': 'MEX', 25 | # 'end_char': 97, 26 | # 'feature_class': 'A', 27 | # 'feature_code': 'ADM1', 28 | # 'geonameid': '3527213', 29 | # 'lat': 17.66667, 30 | # 'lon': -100.0, 31 | # 'name': 'Estado de Guerrero', 32 | # 'score': 1.0, 33 | # 'search_name': 'Guerrero', 34 | # 'start_char': 89}]} 35 | -------------------------------------------------------------------------------- /examples/demo_wiki_resolution.py: -------------------------------------------------------------------------------- 1 | import jsonlines 2 | import spacy 3 | from NGEC import ActorResolver 4 | from tqdm import tqdm 5 | from pprint import pprint 6 | import pandas as pd 7 | 8 | # NOTE: Make sure you have NGEC installed. 9 | # From the main NGEC repo, install the requirements then run `pip install -e .` 10 | # Also make sure you have the offline Wikipedia index installed. 
11 | # See https://github.com/ahalterman/NGEC#quick-start 12 | 13 | # Change the logging levels--Elasticsearch is very verbose 14 | import logging 15 | logging.getLogger("NGEC.actor_resolution").setLevel(logging.WARNING) 16 | 17 | es_logger = logging.getLogger('elasticsearch') 18 | es_logger.setLevel(logging.WARNING) 19 | 20 | # Load the spaCy model we'll use for named entity recognition 21 | nlp = spacy.load("en_core_web_sm") 22 | 23 | # Load the sample data 24 | data = pd.read_csv("Guardian_SDF_sample.csv.zip", compression='zip') 25 | 26 | # Instantiate the model. 27 | # This assumes that you're running the code from the NGEC/examples directory. 28 | actor_resolution_model = ActorResolver(spacy_model=nlp, 29 | base_path="../NGEC/assets/", 30 | save_intermediate=False, 31 | gpu=False) # Set to True if you have a GPU 32 | 33 | # Run spaCy over the docs (iterate over the text column, not the DataFrame itself) 34 | docs = list(nlp.pipe(data['text'].tolist())) 35 | 36 | # Iterate through the docs, making a list of dicts with the PERSON and ORG entities 37 | entities = [] 38 | for doc in docs: 39 | for ent in doc.ents: 40 | if ent.label_ in ['PERSON', 'ORG']: 41 | ent_text = ent.text 42 | d = {"entity": ent_text, 43 | # make sure to include the sentence text--we'll use this for context 44 | "context": ent.sent.text} 45 | entities.append(d) 46 | 47 | # Now iterate through the extracted entities and resolve them to Wikipedia 48 | wikis = [] 49 | for ent in tqdm(entities): 50 | wiki = actor_resolution_model.query_wiki(ent['entity'], context=ent['context']) 51 | if not wiki: 52 | wiki = {"search_term": ent['entity'], 53 | "title": None} 54 | else: 55 | wiki['search_term'] = ent['entity'] 56 | wikis.append(wiki) 57 | 58 | 59 | # Print out 50 example results 60 | for i in wikis[400:450]: 61 | try: 62 | short_desc = i['short_desc'] 63 | except KeyError: 64 | short_desc = "None" 65 | print(f"{i['search_term']:<30} ---> {i['title']} ({short_desc})") 66 | 67 | 68 | 69 | ## Example of categorizing actors using their linked Wikipedia pages 70 | 71 | wiki = actor_resolution_model.query_wiki("Ben Rhodes", context="The former Obama adviser Ben Rhodes said: “We all owe him our gratitude – he literally made us safer.”") 72 | code = actor_resolution_model.wiki_to_code(wiki) 73 | pprint(code) 74 | 75 | wiki = actor_resolution_model.query_wiki("Niloufar Hamedi", context="The two journalists are Niloufar Hamedi, who broke the news of Amini’s death for wearing her headscarf too loose, and Elaheh Mohammadi, who wrote about Amini’s funeral.") 76 | code = actor_resolution_model.wiki_to_code(wiki) 77 | pprint(code) 78 | 79 | ### The code below lets you explore the output of the model in a little more detail. ### 80 | 81 | ## Print out the full logs--this will give you more detail on how many 82 | ## candidate wikipedia matches there are. 83 | logging.getLogger("NGEC.actor_resolution").setLevel(logging.DEBUG) 84 | 85 | # Experiment with upper/lower case, including/excluding context, etc. 
86 | actor_resolution_model.query_wiki("the International Rescue Committee") 87 | actor_resolution_model.query_wiki("Isis") 88 | actor_resolution_model.query_wiki("Isis", context="Fighting continues in Syria with the terrorist group Isis.") 89 | actor_resolution_model.query_wiki("ISIS", context="Fighting continues in Syria with the terrorist group ISIS.") 90 | actor_resolution_model.query_wiki("ISIS") 91 | 92 | 93 | # Example where coding fails without context 94 | sdf = actor_resolution_model.search_wiki("SDF", fuzziness=0) 95 | 96 | 97 | ## Code to explore how the context similarity model works 98 | 99 | from sentence_transformers import SentenceTransformer 100 | from sentence_transformers.util import cos_sim 101 | 102 | def load_trf_model(model_dir='sentence-transformers/paraphrase-MiniLM-L6-v2'): ## Change to offline!! 103 | model = SentenceTransformer(model_dir) 104 | return model 105 | trf = load_trf_model() 106 | 107 | doc = "The SDF (the Kurdish led force raised by Washington to fight Isis) and the United States are sitting on a volcano in north-east Syria, with tens of thousands of foreign fighters and families in cramped detention centres." 108 | encoded = trf.encode(doc) 109 | res = actor_resolution_model.search_wiki("SDF") 110 | 111 | intro_paras = [i['intro_para'][0:200] for i in res] 112 | encoded_intros = trf.encode(intro_paras) 113 | 114 | sims = cos_sim(encoded, encoded_intros)[0] 115 | res[sims.argmax()] -------------------------------------------------------------------------------- /ngec_process.py: -------------------------------------------------------------------------------- 1 | from NGEC import AttributeModel 2 | from NGEC import ActorResolver 3 | from NGEC import GeolocationModel 4 | from NGEC import Formatter 5 | from NGEC import utilities 6 | 7 | import spacy 8 | from tqdm import tqdm 9 | from rich import print 10 | from rich.progress import track 11 | import plac 12 | from pathlib import Path 13 | import re 14 | 15 | import logging 16 | from rich.logging import RichHandler 17 | 18 | logger = logging.getLogger('main') 19 | handler = RichHandler() 20 | #formatter = logging.Formatter( 21 | # '%(asctime)s %(name)-12s %(levelname)-8s %(message)s') 22 | #handler.setFormatter(formatter) 23 | logger.addHandler(handler) 24 | logger.setLevel(logging.INFO) 25 | logger.propagate = False 26 | 27 | loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict] 28 | for i in loggers: 29 | if re.search(r"NGEC\.", i.name): 30 | i.addHandler(handler) 31 | i.setLevel(logging.INFO) 32 | i.propagate = False 33 | if re.search("elasticsearch", i.name): 34 | i.addHandler(handler) 35 | i.setLevel(logging.WARNING) 36 | 37 | #loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict] 38 | #print(loggers) 39 | 40 | # we need to keep the raw tensors for each token 41 | 42 | def load_nlp(): 43 | nlp = spacy.load("en_core_web_trf") 44 | nlp.add_pipe("token_tensors") 45 | return nlp 46 | 47 | 48 | def read_input(input_file="NGEC/PLOVER_coding_201908_220302-1049.jsonl", max_stories=10): 49 | import jsonlines 50 | """ 51 | Read in Factiva stories and return a list of stories for processing 52 | 53 | TODO: 54 | - clean up new lines/whitespace at the beginning and end of headlines and stories 55 | - do Phil's dateline remover here? 56 | 57 | Parameters 58 | ---------- 59 | input_file: str
60 | Path to a JSONL file of stories (currently a file, but possibly from a DB in the future) 61 | 62 | Returns 63 | ------- 64 | stories: list of dicts 65 | - text 66 | - title 67 | - publication 68 | - date 69 | """ 70 | if max_stories > 0: 71 | logger.info(f"Limiting to the first {max_stories} stories.") 72 | with jsonlines.open(input_file, "r") as f: 73 | data = list(f.iter()) 74 | return data[:max_stories] if max_stories > 0 else data # a negative value means "code all stories" 75 | 76 | @plac.pos('input_file', "JSONL input file with events, modes, and contexts") 77 | @plac.opt('max_stories', "Max stories to code", type=int) 78 | @plac.opt('attribute_dir', "Location of the QA attribute model", type=str) 79 | @plac.opt('base_path', "Location of the other models and files", type=Path) 80 | @plac.opt('save_intermediate', "Write output of each intermediate step?", type=bool) 81 | @plac.opt('geo_model', "Location of the geolocation model", type=Path) 82 | @plac.opt('gpu', "Set to True if GPU is available", abbrev='d', type=bool) 83 | def ngec(input_file="NGEC/PLOVER_coding_201908_220302-1049.jsonl", 84 | max_stories=-1, 85 | attribute_dir="NGEC/assets/roberta-base-squad2_2022-08-02", 86 | base_path="NGEC/assets/", 87 | save_intermediate=False, 88 | expand_actors=True, 89 | geo_model="../mordecai3/mordecai_2023-02-07_good.pt", 90 | gpu=False): 91 | 92 | utilities.spacy_doc_setup() 93 | nlp = load_nlp() 94 | 95 | # Initialize the processing models/objects 96 | #event_model = EventClassModel() 97 | #context_model = ContextModel() 98 | #mode_model = ModeModel() 99 | logger.info("Loading geolocation model...") 100 | geolocation_model = GeolocationModel(geo_model, 101 | geo_path="../mordecai3/mordecai3/assets/", 102 | save_intermediate=save_intermediate) 103 | attribute_model = AttributeModel(attribute_dir, 104 | silent=True, 105 | gpu=gpu, 106 | save_intermediate=save_intermediate, 107 | expand_actors=expand_actors, 108 | base_path=base_path) 109 | actor_resolution_model = ActorResolver(spacy_model=nlp, 110 | base_path=base_path, 111 | save_intermediate=save_intermediate, 112 | gpu=gpu) 113 | formatter = Formatter(base_path=base_path) 114 | 115 | # Read in the stories 116 | story_list = read_input(input_file, max_stories) 117 | 118 | just_text = [i['event_text'] for i in story_list] 119 | doc_list = list(track(nlp.pipe(just_text), total=len(just_text), description="nlping docs...")) 120 | 121 | #story_list = event_model.process(story_list) 122 | #story_list = mode_model.process(story_list) 123 | #story_list = context_model.process(story_list) 124 | logger.info("Geolocating events...") 125 | story_list = geolocation_model.process(story_list, doc_list) 126 | 127 | event_list = utilities.stories_to_events(story_list, doc_list) 128 | logger.debug("Post-event split") 129 | logger.debug(f"{event_list[0]}") 130 | #event_list = mode_model(event_list) 131 | 132 | logger.info("Running attribute model...") 133 | event_list = attribute_model.process(event_list, doc_list) 134 | #print(event_list[0]) 135 | logger.info("Running actor resolution model...") 136 | event_list = actor_resolution_model.process(event_list, doc_list) 137 | #print(event_list[0]) 138 | 139 | logger.info("Formatting results...") 140 | cleaned_events = formatter.process(event_list) 141 | logger.info("Completed processing.") 142 | 143 | if __name__ == "__main__": 144 | plac.call(ngec) -------------------------------------------------------------------------------- /ngec_streamlit.py: -------------------------------------------------------------------------------- 1 | from NGEC import EventClass 2 | from NGEC import AttributeModel 3 | from NGEC import 
ActorResolver 4 | from NGEC import GeolocationModel 5 | from NGEC import Formatter 6 | from NGEC import utilities 7 | 8 | import streamlit as st 9 | 10 | import spacy 11 | import pandas as pd 12 | 13 | # stuff that's just used to allow streamlit caching 14 | import preshed 15 | import cymem 16 | import spacy_transformers 17 | import thinc 18 | 19 | st.markdown("## NGEC test interface") 20 | 21 | st.markdown("Put in some story text to see what NGEC produces.") 22 | st.markdown("The event classifier step uses the open-source models trained on synthetic documents. The accuracy is not as good as the proprietary models used to produce the POLECAT dataset. To manually override the event classification, set the event type (and mode) in the sidebar.") 23 | st.markdown("Intermediate output is also returned but hidden by default.") 24 | 25 | #@st.cache(allow_output_mutation = True) 26 | @st.cache_resource() 27 | def load_nlp(): 28 | utilities.spacy_doc_setup() 29 | nlp = spacy.load("en_core_web_trf") 30 | nlp.add_pipe("token_tensors") 31 | return nlp 32 | 33 | nlp = load_nlp() 34 | 35 | def format_output(cleaned_events): 36 | for event in cleaned_events: 37 | if 'ACTOR' in event['attributes'].keys() and event['attributes']['ACTOR']: 38 | actors = '; '.join([i['text'] for i in event['attributes']['ACTOR']]) 39 | actor_codes = '; '.join([f"{i['country']} {i['code_1']}" for i in event['attributes']['ACTOR']]) 40 | actor_wikis = '; '.join([i['wiki'] for i in event['attributes']['ACTOR']]) 41 | else: 42 | actors = "" 43 | actor_codes = "" 44 | actor_wikis = "" 45 | if 'RECIP' in event['attributes'].keys() and event['attributes']['RECIP']: 46 | recipients = '; '.join([i['text'] for i in event['attributes']['RECIP']]) 47 | recipient_codes = '; '.join([f"{i['country']} {i['code_1']}" for i in event['attributes']['RECIP']]) 48 | recip_wikis = '; '.join([i['wiki'] for i in event['attributes']['RECIP']]) 49 | else: 50 | recipients = "" 51 | recipient_codes = "" 52 | recip_wikis = "" 53 | if event['event_geolocation']['geo']: 54 | resolved_placename = event['event_geolocation']['geo']['resolved_placename'] 55 | adm1 = event['event_geolocation']['geo']['admin1_name'] 56 | country = event['event_geolocation']['geo']['country_name'] 57 | else: 58 | resolved_placename = "" 59 | adm1 = "" 60 | country = "" 61 | #st.success(actors) 62 | d = {"Raw Actors": actors, 63 | "Actor Codes": actor_codes, 64 | "Actor Wikis": actor_wikis, 65 | "Event Type": event['event_type'], 66 | "Event Mode": event['event_mode'], 67 | "Raw Recipients": recipients, 68 | "Recipient Codes": recipient_codes, 69 | "Recipient Wikis": recip_wikis, 70 | "Resolved Placename": resolved_placename, 71 | "Admin1": adm1, 72 | "Country": country, 73 | "Date": event['date_resolved']} 74 | df = pd.DataFrame(d, index=[0]).transpose() 75 | df = df.reset_index() 76 | df.columns = ["Attribute", "Value"] 77 | # disable row numbers 78 | df.index = [""] * len(df) 79 | st.table(df) 80 | 81 | 82 | 83 | save_intermediate=False 84 | attribute_dir="NGEC/assets/deberta_squadnewsqa_2023-05-22" 85 | base_path="./NGEC/assets/" 86 | 87 | expand_actors=True 88 | geo_model="/home/andy/projects/mordecai/mordecai3/assets/mordecai_2023-02-07_good.pt" 89 | geo_path="/home/andy/projects/mordecai/mordecai3/assets/" 90 | 91 | gpu=True 92 | 93 | #@st.cache(allow_output_mutation = True) 94 | @st.cache_resource() 95 | def load_event_class(): 96 | event_model = EventClass() 97 | return event_model 98 | 99 | pub_date = st.sidebar.text_input("Publication 
date", "today") 100 | event_type = st.sidebar.text_input("Event type", "") 101 | event_mode = st.sidebar.text_input("Mode type", "") 102 | show_intermediate = st.sidebar.checkbox("Show intermediate output", False) 103 | event_model = load_event_class() 104 | 105 | #@st.cache(allow_output_mutation = True) 106 | @st.cache_resource() 107 | def load_geo(save_intermediate=save_intermediate): 108 | geolocation_model = GeolocationModel(geo_model, 109 | geo_path=geo_path, 110 | save_intermediate=save_intermediate) 111 | return geolocation_model 112 | 113 | #@st.cache(allow_output_mutation = True) 114 | @st.cache_resource() 115 | def load_attr(attribute_dir=attribute_dir, silent=True, gpu=gpu, save_intermediate=save_intermediate, expand_actors=expand_actors, 116 | base_path=base_path): 117 | attribute_model = AttributeModel(attribute_dir, 118 | silent=silent, 119 | gpu=gpu, 120 | save_intermediate=save_intermediate, 121 | base_path=base_path, 122 | expand_actors=expand_actors) 123 | return attribute_model 124 | 125 | 126 | @st.cache_resource() 127 | def load_resolution(nlp=nlp, base_path=base_path, save_intermediate=save_intermediate, gpu=gpu): 128 | actor_resolution_model = ActorResolver(spacy_model=nlp, base_path=base_path, save_intermediate=save_intermediate, gpu=gpu) 129 | return actor_resolution_model 130 | 131 | @st.cache_resource() 132 | def load_formatter(base_path=base_path): 133 | formatter = Formatter(base_path=base_path) 134 | return formatter 135 | 136 | geolocation_model = load_geo() 137 | attribute_model = load_attr(base_path=base_path) 138 | actor_resolution_model = load_resolution() 139 | formatter = load_formatter() 140 | 141 | text = st.text_area("Input text", "German troops withdrew from their area of operations in Kandahar last week.") 142 | 143 | 144 | 145 | 146 | if text: 147 | doc_list = [nlp(text)] 148 | 149 | story_list = [{"event_text": text, "id": "123", "event_type": [event_type], "event_mode": [event_mode], "pub_date": pub_date}] 150 | 151 | if not event_type: 152 | story_list = event_model.process(story_list) 153 | if show_intermediate: 154 | with st.expander("Show event class step output", expanded=False): 155 | st.write(story_list) 156 | if not story_list[0]['event_type']: 157 | st.error("No event type detected.") 158 | st.stop() 159 | story_list = geolocation_model.process(story_list, doc_list) 160 | 161 | event_list = utilities.stories_to_events(story_list, doc_list) 162 | 163 | if show_intermediate: 164 | with st.expander("Show geolocation step output", expanded=False): 165 | st.write(event_list) 166 | 167 | event_list = attribute_model.process(event_list, doc_list) 168 | if show_intermediate: 169 | with st.expander("Show attribute step output", expanded=False): 170 | st.write(event_list) 171 | 172 | event_list = actor_resolution_model.process(event_list) 173 | if show_intermediate: 174 | with st.expander("Show actor resolution step output", expanded=False): 175 | st.write(event_list) 176 | 177 | st.markdown("### Final output") 178 | cleaned_events = formatter.process(event_list, return_raw=True) 179 | 180 | st.markdown(text) 181 | format_output(cleaned_events) 182 | 183 | with st.expander("Show raw final output", expanded=False): 184 | st.write(cleaned_events) 185 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # conda install -c huggingface tokenizers 2 | aiohttp==3.8.1 3 | aiosignal==1.2.0 4 | async-timeout==4.0.2 5 | 
attrs==21.4.0 6 | blis==0.7.6 7 | cachetools==5.0.0 8 | catalogue==2.0.6 9 | certifi==2021.10.8 10 | charset-normalizer==2.0.12 11 | click==8.0.4 12 | cymem==2.0.6 13 | cytoolz==0.11.2 14 | datasets==1.18.4 15 | dateparser==1.1.0 16 | dill==0.3.4 17 | filelock==3.6.0 18 | frozenlist==1.3.0 19 | fsspec==2022.2.0 20 | huggingface-hub==0.4.0 21 | idna==3.3 22 | jellyfish==0.9.0 23 | Jinja2==3.0.3 24 | joblib==1.1.0 25 | langcodes==3.3.0 26 | MarkupSafe==2.1.0 27 | mordecai3 @ file:///Users/ahalterman/MIT/Geolocation/mordecai3_scratch 28 | multidict==6.0.2 29 | multiprocess==0.70.12.2 30 | murmurhash==1.0.6 31 | networkx==2.7.1 32 | nltk==3.7 33 | numpy==1.22.3 34 | packaging==21.3 35 | pandas==1.4.1 36 | pathy==0.6.1 37 | Pillow==9.0.1 38 | preshed==3.0.6 39 | pyarrow==7.0.0 40 | pydantic==1.8.2 41 | pyparsing==3.0.7 42 | pyphen==0.12.0 43 | python-dateutil==2.8.2 44 | pytz==2021.3 45 | pytz-deprecation-shim==0.1.0.post0 46 | PyYAML==6.0 47 | regex==2022.3.2 48 | requests==2.27.1 49 | responses==0.18.0 50 | sacremoses==0.0.47 51 | scikit-learn==1.0.2 52 | scipy==1.8.0 53 | sentence-transformers==2.2.0 54 | sentencepiece==0.1.96 55 | six==1.16.0 56 | smart-open==5.2.1 57 | spacy==3.2.3 58 | spacy-legacy==3.0.9 59 | spacy-loggers==1.0.1 60 | srsly==2.4.2 61 | textacy==0.12.0 62 | thinc==8.0.13 63 | threadpoolctl==3.1.0 64 | tokenizers==0.11.6 65 | toolz==0.11.2 66 | torch==1.12.0.dev20220303 67 | torchaudio==0.12.0.dev20220303 68 | torchvision==0.13.0.dev20220303 69 | tqdm==4.63.0 70 | transformers==4.17.0 71 | typer==0.4.0 72 | typing_extensions==4.1.1 73 | tzdata==2021.5 74 | tzlocal==4.1 75 | urllib3==1.26.8 76 | wasabi==0.9.0 77 | xxhash==3.0.0 78 | yarl==1.7.2 79 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from setuptools import setup, find_packages 4 | setup( 5 | name = 'ngec', 6 | packages = find_packages(), 7 | ) -------------------------------------------------------------------------------- /setup/README.md: -------------------------------------------------------------------------------- 1 | ## Setup 2 | 3 | This directory includes code for setting up the offline Wikipedia index and for training custom event detection models. Please see each directory for code and documentation. -------------------------------------------------------------------------------- /setup/train_classifiers/README.md: -------------------------------------------------------------------------------- 1 | # Training classifiers for NGEC 2 | 3 | This directory contains example code for training event classifiers to use in the NGEC pipeline. 4 | 5 | Because of conditions imposed by our funder and the proprietary data used in the project, we cannot share the training data or the trained models used to produce the POLECAT event dataset. However, we can provide example code for training classifiers on your own data and demonstration pretrained models for the event categories used in the POLECAT dataset that draw on a corpus of pseudo-labeled synthetic news stories. These classifiers are not as accurate as the ones used in the POLECAT dataset, but work pretty well and could easily be improved with additional training data. 
For these demonstration classifiers, we use the synthetic data approach described in [Halterman (2023)](https://arxiv.org/abs/2303.16028): we prompt a language model with hand-written headlines that encode the desired event types, generate synthetic news stories from those prompts, and use each headline's event type as a pseudo-label to train the classifier. 6 | 7 | Our primary objective with this pipeline is to encourage other researchers to develop custom event datasets for their own purposes. 8 | Most researchers will want to train custom classifiers using their own event ontologies, which requires generating new training data. 9 | 10 | ## Contents 11 | 12 | - `fit_event_classifier.py`: code to implement a simple multi-label, multi-class classifier. The core classification model is a set of per-event linear support vector classifiers fit on top of sentence embeddings produced by a [sentence-transformers model](https://huggingface.co/sentence-transformers/paraphrase-mpnet-base-v2). 13 | - `generate_synthetic_news.py`: code to generate synthetic news stories using an offline Huggingface pretrained language model. 14 | - `synthetic_headlines.csv`: a list of hand-written headlines used to prompt the language model to generate news stories. 15 | - `gpt_synthetic_events_2023-10-19_19.csv.zip`: a zipped CSV of synthetic news stories with event and mode pseudo-labels for training the event classifier. 16 | 17 | -------------------------------------------------------------------------------- /setup/train_classifiers/fit_event_classifier.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers import SentenceTransformer 2 | import pandas as pd 3 | from sklearn.svm import SVC 4 | import numpy as np 5 | import skops.io as sio 6 | import os 7 | 8 | # path to sentence-transformer model directory if you've downloaded it 9 | # Leave blank to download from huggingface on the fly 10 | MODEL_DIR = "" 11 | #change device to 'cuda' if you have a GPU enabled 12 | DEVICE = "cpu" 13 | 14 | synth_df = pd.read_csv("gpt_synthetic_events_2023-10-19_19.csv.zip", 15 | compression="zip") 16 | 17 | 18 | def load_model(model_name="paraphrase-mpnet-base-v2"): 19 | if MODEL_DIR: 20 | # use the local copy 21 | model = SentenceTransformer(os.path.join(MODEL_DIR, model_name)) 22 | else: 23 | # otherwise, download from huggingface 24 | model = SentenceTransformer(f'sentence-transformers/{model_name}') 25 | return model 26 | 27 | model = load_model() 28 | encoded = model.encode(synth_df['text'].values, show_progress_bar=True, 29 | device=DEVICE).tolist() 30 | synth_df['encoded'] = encoded 31 | 32 | 33 | def fit_initial_model(synth_df): 34 | clf = SVC(class_weight="balanced", 35 | kernel="linear", 36 | probability=True, 37 | C=0.1) 38 | y_train = synth_df['label'] 39 | clf.fit(synth_df['encoded'].to_list(), y_train) 40 | pred = pd.DataFrame(clf.predict_proba(synth_df['encoded'].to_list())) 41 | # rename columns with the event names 42 | pred.columns = clf.classes_ 43 | return pred 44 | 45 | pred = fit_initial_model(synth_df) 46 | synth_df = pd.concat([synth_df, pred], axis=1) 47 | event_types = synth_df['label'].unique() 48 | 49 | for event in event_types: 50 | print(event) 51 | # First, sample the positive cases (assuming the prompts are reliable) 52 | train_pos_synth = synth_df[synth_df['label'] == event].copy() 53 | train_pos_synth['label'] = 1 54 | # Now sample negative cases, but don't pick anything that might 55 | # be a positive case 56 | candidate_neg = synth_df[synth_df[event] < 0.05].copy() 57 | # Take 3x as many negative 
cases as positive cases, or 58 | # as many as we can find 59 | sample_size = min(candidate_neg.shape[0], 60 | train_pos_synth.shape[0] * 3) 61 | train_neg_synth = candidate_neg.sample(sample_size).copy() 62 | train_neg_synth['label'] = 0 63 | # Now combine the positive and negative cases 64 | print(train_pos_synth.shape, train_neg_synth.shape) 65 | train = pd.concat([train_pos_synth, train_neg_synth], axis=0) 66 | X_train = np.array(train['encoded'].tolist()) 67 | y_train = train['label'] 68 | clf = SVC(class_weight="balanced", 69 | kernel="linear", 70 | probability=True) 71 | clf.fit(X_train, y_train) 72 | 73 | sio.dump(clf, f"models/{event}.skops") 74 | 75 | 76 | ## For production use, see https://github.com/ahalterman/NGEC/blob/main/NGEC/event_class.py 77 | 78 | 79 | 80 | 81 | ## A bunch of stuff that didn't really work 82 | 83 | # convert single label y to multi-label y 84 | #from sklearn.preprocessing import MultiLabelBinarizer 85 | #mlb = MultiLabelBinarizer() 86 | #y_train_bin = mlb.fit_transform([[i] for i in y_train]) 87 | #y_val_bin = mlb.fit_transform([[i] for i in y_val]) 88 | # 89 | ## train multi-label logistic regression model 90 | #clf = RandomForestClassifier(class_weight="balanced") 91 | #clf.fit(X_train, y_train_bin) 92 | #y_pred = clf.predict_proba(X_val) 93 | #print(classification_report(y_val_bin, y_pred)) 94 | # 95 | #from cleanlab.classification import CleanLearning 96 | # 97 | #cl = CleanLearning(clf) 98 | #cl.fit(X_train, y_train_bin) 99 | # 100 | ### One-by-one classifiers 101 | #from pulearn import WeightedElkanotoPuClassifier 102 | # 103 | #y_train = np.array(train['event'] == "ASSAULT").astype(int) * 2 - 1 104 | # 105 | #clf = LogisticRegression(C=0.4, class_weight="balanced") 106 | #clf.fit(X_train, y_train) 107 | # 108 | #y_val = np.array(val['event'] == "ASSAULT").astype(int) * 2 - 1 109 | #y_pred = clf.predict(X_val) 110 | #print(classification_report(y_val, y_pred)) 111 | # 112 | # 113 | ## Experimented with PU learning, but it didn't work well 114 | #from pulearn import WeightedElkanotoPuClassifier 115 | #pu_estimator = WeightedElkanotoPuClassifier( 116 | # estimator=clf, labeled=10, unlabeled=20, hold_out_ratio=0.2) 117 | #pu_estimator.fit(X_train, y_train) 118 | # 119 | #y_pred = pu_estimator.predict(X_val) 120 | #print(classification_report(y_val, y_pred)) -------------------------------------------------------------------------------- /setup/train_classifiers/generate_synthetic_news.py: -------------------------------------------------------------------------------- 1 | from transformers import pipeline, set_seed 2 | import re 3 | from tqdm import tqdm 4 | import random 5 | import pandas as pd 6 | import datetime 7 | 8 | text_generator = pipeline('text-generation', model='gpt2-xl', device=0) 9 | #set_seed(42) 10 | 11 | prompt = "Thousands of Soldiers Deployed to Czech Border to Address Unfolding Crisis\n\n(BBC Monitoring) --" 12 | text_generator(prompt, max_length=300) 13 | 14 | cities = ["Abuja", "Kabul", "Belgrade", "Zagreb", "Khartoum", "Vienna", "Dhaka", "Brussels", 15 | "Minsk", "Kinshasa", "Beijing", "Bogota", "Sao Paulo", "Havana", "Berlin", "Prague", 16 | "Moscow", "Washington", "Cairo", "Jerusalem", "Delhi", "Tehran", "Rome", "Amman", 17 | "Beirut", "Tokyo", "Nairobi", "New York", "Panama City", "Oslo", "Damascus", 18 | "Bangkok", "Istanbul", "London", "Abu Dhabi"] 19 | 20 | c_df = pd.read_csv("countries.csv") 21 | countries = c_df['Name'].to_list() 22 | 23 | def make_stories(prompt, source, pattern, max_len=100, n=5): 24 | output = 
text_generator(prompt, 25 | max_length=max_len, 26 | num_return_sequences=n, 27 | pad_token_id=50256 28 | ) 29 | selected = [] 30 | for out in output: 31 | out['text'] = re.sub(re.escape(prompt), "", out['generated_text']) 32 | #toks = set([i.lower() for i in out['text'].split(" ")]) 33 | selected.append(out) 34 | final = [] 35 | for i in selected: 36 | disclaimer = "### THIS IS A SYNTHETIC STORY. DO NOT TRUST THE FACTUAL CONTENT OF THIS TEXT. Created by Andy Halterman to train a document-level political event classifier ###" 37 | text = disclaimer + i['text'].strip() 38 | d = {"text": text, 39 | "title": pattern['title'], 40 | "source": source, 41 | "prompt": prompt, 42 | "label": pattern['event'], 43 | "mode": pattern['mode']} 44 | final.append(d) 45 | return final 46 | 47 | def make_prompt_and_gen(pattern, 48 | source, 49 | max_len=100, 50 | unique_prompts=5, 51 | n_per_city=5): 52 | all_stories = [] 53 | for n in range(unique_prompts): 54 | city = random.sample(cities, 1)[0] 55 | country_1, country_2, country_3 = random.sample(countries, 3) 56 | headline = pattern['title'].format(country_1=country_1, 57 | country_2=country_2, 58 | country_3=country_3, 59 | city=city) 60 | prompt = f"{headline}\n\n({source}) --" 61 | stories = make_stories(prompt, source, pattern, n=n_per_city, max_len=max_len) 62 | all_stories.extend(stories) 63 | return all_stories 64 | 65 | 66 | def run(): 67 | patterns = pd.read_csv("synthetic_headlines.csv") 68 | patterns = patterns.sample(frac=1) 69 | 70 | all_output = [] 71 | for n, pattern in tqdm(patterns.iterrows(), total=patterns.shape[0]): 72 | if not pattern['title']: 73 | continue 74 | print(pattern['event']) 75 | for source in ['Reuters', 'AFP', 'BBC Monitoring', 'AP', 'local sources', 'local media']: 76 | out = make_prompt_and_gen(pattern, source, max_len=300, unique_prompts=2, n_per_city=1) 77 | all_output.extend(out) 78 | #except Exception as e: 79 | # print(e) 80 | df = pd.DataFrame(all_output) 81 | #df.to_csv("gpt_synthetic_events_cities.csv") 82 | today = datetime.datetime.today().strftime('%Y-%m-%d_%H') 83 | df.to_csv(f"gpt_synthetic_events_{today}.csv") 84 | 85 | 86 | if __name__ == "__main__": 87 | run() 88 | -------------------------------------------------------------------------------- /setup/train_classifiers/gpt_synthetic_events_2023-10-19_19.csv.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/setup/train_classifiers/gpt_synthetic_events_2023-10-19_19.csv.zip -------------------------------------------------------------------------------- /setup/train_classifiers/synthetic_headlines.csv: -------------------------------------------------------------------------------- 1 | event,mode,title 2 | ACCUSE,,{country_1} Leader Criticizes {country_2}'s Response to Crisis 3 | ACCUSE,,"Activists Decry {city} Bombings, Highlighting Civilian Casualties" 4 | ACCUSE,DISAPPROVE,{country_1} Leader Condemns Human Rights Violations in Speech 5 | ACCUSE,DISAPPROVE,{country_1} condemns {country_2} for treatment of migrants 6 | ACCUSE,DISAPPROVE,{country_1} Citizens Express Widespread Disapproval for Leader in Recent Polls 7 | ACCUSE,INVESTIGATE,ICC States that it will Begin Investigation of Alleged War Crimes in {country_1} 8 | ACCUSE,INVESTIGATE,Investigation begins into last week's events 9 | ACCUSE,INVESTIGATE,Police in {country_1} begin probe into last week's scandal 10 | ACCUSE,INVESTIGATE,Commission forms to study the causes of 
last year's disaster 11 | ACCUSE,INVESTIGATE,International Body to Establish Inquiry into Possible {country_1} Genocide 12 | ACCUSE,ALLEGE,{country_1} President Charged with Human Rights Violations 13 | ACCUSE,ALLEGE,Six indicted by grand jury 14 | ACCUSE,ALLEGE,{country_1} brings official complaint against {country_2} for currency manipulation 15 | ACCUSE,ALLEGE,{country_1}: Former Politician Faces Charges in Bribery Scheme 16 | ACCUSE,DISAPPROVE,{country_1} Denounces Actions By {country_2} 17 | ACCUSE,DISAPPROVE,"{country_2} Spokesperson: ""{country_2}'s Behavior is Deplorable""" 18 | ACCUSE,DISAPPROVE,NGO in {city} Issues Statement Condemning Government 19 | ACCUSE,DISAPPROVE,Moderate Candidates in {country_1} Denounce Far-Right Party 20 | ACCUSE,DISAPPROVE,{country_1} Condemns Violence Against Civilians in Ongoing War 21 | ACCUSE,DISAPPROVE,Activists in {city} Decry New Government Policy 22 | ACCUSE,ALLEGE,"{country_1} Accuses {country_2} of Trade Violations, Vows WTO Case" 23 | ACCUSE,ALLEGE,Police in {city} Allege Local Companies Violated Law 24 | ACCUSE,ALLEGE,{city} Police Charge Dozens After Protest 25 | ACCUSE,ALLEGE,{country_1} Alleges {country_2} Violated Laws of War 26 | ACCUSE,ALLEGE,"{city} Man Sues Government, Alleging Rights Violations" 27 | ACCUSE,ALLEGE,Local NGO Accuses Government of Coverup 28 | ACCUSE,ALLEGE,Justice Ministry Brings Case Against Former Officials ({country_1}) 29 | ACCUSE,INVESTIGATE,{city} Police Investigating Shooting Death of Local Man 30 | ACCUSE,INVESTIGATE,{country_1} Convenes Grand Jury to Investigate Corruption 31 | ACCUSE,INVESTIGATE,{country_1} Begins Investigation Into Past Human Rights Violations 32 | ACCUSE,INVESTIGATE,{country_1} Launches Commission on {country_2} Abuses 33 | ACCUSE,INVESTIGATE,Activists Investigate Brutality During {country_1} War 34 | ACCUSE,INVESTIGATE,{country_1} Parliament Establishes Special Investigative Committee 35 | ACCUSE,INVESTIGATE,Police in {city} Appeal For Help in Investigation 36 | AGREE,,"{country_1}, {country_2} Agree to Hold Regular High-Level Meetings" 37 | AGREE,,{country_1} Leader Agrees to Provide Military Aid to {country_1} In Response to Conflict 38 | AGREE,,{country_1} Expresses Willingness to Cooperate on New Regional Security Framework 39 | AGREE,,Finance Ministers Gathered in {city} Agree to Impose New Banking Capital Requirements 40 | AGREE,,{country_1} and {country_2} have Promised to Ratify a New Treaty 41 | AGREE,,{country_1} Offers Humanitarian Support to {city} Following Yesterday's Disaster 42 | AGREE,,{country_1} and {country_2} Agree to New Limits on Conventional Weapons 43 | AGREE,,Mayor of {city} Reaches an Agreement with Striking Workers 44 | AGREE,,Leader of {country_1} Expresses New Willingness to Work with the International Community 45 | AGREE,,Scoop: Secret Negotiations in {city} Produce Promises to Support Rebels 46 | AID,,"{country_1}, {country_2} Sign Billion Dollar Aid Package" 47 | AID,,{country_1} Announces Transfer of Additional Humanitarian Aid to {country_2} in Wake of Disaster 48 | AID,,Military trainers and arms shipment arrive in war-torn {country_1} 49 | AID,,Emergency humanitarian aid arrives in famine-affected areas of {country_1} 50 | AID,,U.N. 
provides humanitarian aid in the wake of last week's disaster 51 | AID,,Four activists granted asylum in local embassy 52 | AID,,Major debt relief as IMF agrees to forgive millions owned by {country_1} 53 | ASSAULT,,{city} Ethnic Clashes Claim 11 Lives 54 | ASSAULT,,"26 Dead in Jihadist Attacks this Week, Media Reports" 55 | ASSAULT,AERIAL,{country_1}: Four killed in air strike 56 | ASSAULT,AERIAL,War planes pummel rebel positions in {country_1} 57 | ASSAULT,AERIAL,"Allied aircraft enforce no-fly-zone, shooting down {country_1} fighter plane" 58 | ASSAULT,ABDUCT,"{country_1} Mayor Kidnapped, Killed" 59 | ASSAULT,ABDUCT,13 Students Abducted from School in {city} by Insurgent Group 60 | ASSAULT,BEAT,Police Searching for Suspects after Prominent Elder Beaten 61 | ASSAULT,BEAT,{country_1} Police Officers Beat Protestors at Opposition Rally 62 | ASSAULT,TORTURE,{country_1}: Widespread Torture During Government Occupation 63 | ASSAULT,TORTURE,"{country_1} Responsible for Torture, Killing of Prominent Activist" 64 | ASSAULT,EXECUTE,Video Purpotedly Shows Execution of Second Hostage 65 | ASSAULT,EXECUTE,Two Local Administrators Beheaded by Insurgents in South {country_1} 66 | ASSAULT,SEXUAL,Army Urged to End Tactics of Sexual Violence During Ongoing Offensive in West {country_1} 67 | ASSAULT,SEXUAL,"Survivors of Attack Describe Rapes, Other Atrocities" 68 | ASSAULT,ASSASSINATE,{country_1} Lawmaker Killed in Targeted Attack in {city} 69 | ASSAULT,ASSASSINATE,Reporter Shot and Killed by Masked Gunmen in {city} 70 | ASSAULT,DESTROY,{country_1} Military Razes Two Homes in Response to Deadly Attack 71 | ASSAULT,DESTROY,"Bandits Injure Six, Raze 47 Houses in {city} Villages" 72 | ASSAULT,PRIMATIVE,Militants Behead Local Official in {city} Over Government Policy 73 | ASSAULT,PRIMATIVE,Priest Burned to Death in Southern {country_1} 74 | ASSAULT,PRIMATIVE,Angry mob throws rocks and bottles 75 | ASSAULT,PRIMATIVE,Local opposition leader beaten with baseball bat 76 | ASSAULT,FIREARMS,"Gunmen Storm Agency Headquarters, Kill Police Officers, Several Civilians" 77 | ASSAULT,FIREARMS,Gunmen Kill 13 in Fresh Attack on {city} 78 | ASSAULT,EXPLOSIVES,"{country_1} official: Blast rocks country's capital, killing 11" 79 | ASSAULT,EXPLOSIVES,"Eight Killed, over 45 Injured in {city} Bomb Explosion" 80 | ASSAULT,SUICIDE-ATTACK,Extremist Group Claims Responsibility for Recent Suicide Bombing 81 | ASSAULT,SUICIDE-ATTACK,Suicide Car Bomb in {city} Kills At Least 44 82 | ASSAULT,AERIAL,"{country_1} Air Force Bombs Training Camp, No Survivors" 83 | ASSAULT,AERIAL,{country_1} Army Says 14 Killed in Air Strike Were Terrorists 84 | ASSAULT,DRONE,"Drone Strike Killed 2 Civillians, Family Says" 85 | ASSAULT,DRONE,Drone strikes increase as {country_1} conflict intensifies 86 | ASSAULT,DRONE,{country_1} UAV destroys enemy targets in {country_2} 87 | ASSAULT,DRONE,{country_1} Kills Senior Commander in {country_2} With Drone Strike 88 | ASSAULT,HEAVY-WEAPONS,Shelling in {city} Kills Six Children 89 | ASSAULT,HEAVY-WEAPONS,Border Shelling Kills Three Civilians in Renewed Tensions between {country_1} and {country_2} 90 | ASSAULT,CROWD-CONTROL,4 Injured as Police Turn on Protestors in {city} 91 | ASSAULT,CROWD-CONTROL,Witness: Military Uses Tear Gas on Rally in Central {city} 92 | ASSAULT,CLEANSING,Mass Deportations Reported in {country_1} in Lead Up to Summit 93 | ASSAULT,CLEANSING,{country_1} Conflict Enters New Stage with Ethnic Cleansing 94 | ASSAULT,MASSACRE,{country_1} Bandits Kill 75 in Pre-dawn Massacre: Family Members 95 | 
ASSAULT,MASSACRE,"Militants Attack {city} Village, Killing 65 Civilians" 96 | ASSAULT,UNCONVENTIONAL,Observers Report Use of Chemical Weapons on Civilians in {country_1} 97 | ASSAULT,UNCONVENTIONAL,Gas Attack in {city} Kills Scores Amidst Government Denials 98 | COERCE,,"Restrictions to Individual Rights, Media Freedoms Imposed in {country_1} Following Transition" 99 | COERCE,,Outrage over {country_1} Repression 100 | COERCE,SIEZE,"Police Raid Human Rights Organization Offices, Seize Computers " 101 | COERCE,SIEZE,Observers Worry About Indiscriminate Police Raids on Ethnic Community in {country_1} 102 | COERCE,RESTRICT,{country_1} Village under Military Lockdown Reportedly Running Out of Food 103 | COERCE,RESTRICT,{country_1} City Blockaded by {country_2} Troops 104 | COERCE,BAN,{country_1}: Three Human Rights NGOs Banned for Ties to Foreign Governments 105 | COERCE,BAN,{country_1} Independence Party Formally Outlawed Under New Law 106 | COERCE,CENSOR,{country_1} Tightens the Screws on Media Freedoms 107 | COERCE,CENSOR,Social Media Platforms Temporarily Suspended in North {country_1} Following Recent Unrest 108 | COERCE,CURFEW,"Turmoil in {country_1}: Curfew Imposed After Clashes, Murders" 109 | COERCE,CURFEW,Nationwide Curfew Imposed Following Recent Protests 110 | COERCE,MARTIAL-LAW,{country_1} Imposes Martial Law in Response to Election Violence 111 | COERCE,MARTIAL-LAW,Prime Minister to Extend Martial Law for Another Week 112 | COERCE,ARREST,Arbitrary Arrests in {country_1} Condemned by International Observers 113 | COERCE,ARREST,{country_1} Arrests 20 in Regional Crackdown 114 | COERCE,DEPORT,{country_1} Expels Human Rights Activist in Response to Accusations 115 | COERCE,DEPORT,Failed Asylum Seeker Deported Back to {country_1} 116 | COERCE,WITHHOLD,{country_1} Cuts Off Internet in Some Provinces In Light of Unrest 117 | COERCE,WITHHOLD,Internet Shutdown in {country_1} Begins In Anticipation of Renewed Violence 118 | COERCE,MISINFORMATION,{country_1} Says Use of Misinformation Response to Ongoing External Threat from {country_2} 119 | COERCE,MISINFORMATION,Facebook admits to 'coordinated misinformation' aimed at upcoming election in {country_1} 120 | COERCE,MISINFORMATION,Foreign 'information operation' spreading disinformation uncovered in {country_1} 121 | COERCE,MISINFORMATION,Incumbent Spreading Fake News Ahead of Upcoming {country_1} Election 122 | COERCE,MISINFORMATION,State actors are behind disinformation campaign says {country_1} 123 | COERCE,CYBER,{country_1} News Site Hit by Cyber Attack 124 | COERCE,CYBER,"Cyber warfare is already happening, says {country_1} government spokesperson" 125 | COERCE,CYBER,{country_1} Cyberattack Takes Down Communications Network for Several Hours 126 | COERCE,CYBER,Power plant disabled by cyber attack 127 | COERCE,CYBER,Foreign hackers target government network in {country_1} 128 | CONCEDE,,Mayor of {city} Promises to End Curfew After Upcoming Elections 129 | CONCEDE,,Candidate Promises to Relax Restrictions on Migration if Elected 130 | CONCEDE,,Mayor of {city} Announced that Police Will Not Beat Protestors 131 | CONCEDE,,Demonstrators Claim Victory as {country_1} Agrees to All Demands 132 | CONCEDE,,Rebels in {country_1} Have Announced Their Intention to Lay Down Arms 133 | CONCEDE,,{country_1}: Evening Curfew will be Lifted Starting Tomorrow 134 | CONCEDE,,Workers Will End Strike After {country_1} Promises Improved Working Conditions 135 | CONCEDE,,{country_1} Will Withdraw Complaint at the WTO Against {country_2} 136 | 
CONCEDE,,{country_1} Will Shutter Controversal Project After Massive Objections 137 | CONCEDE,,{country_1} Agrees to Withdraw from Seized Territory 138 | CONCEDE,,{country_1} Gives Into Demands Made By {country_2} 139 | CONCEDE,,Government of {country_1} Agrees to Concessions in Order to Avert Crisis 140 | CONCEDE,,Concessions Made By {country_1} in Order to Avoid Escalation of Conflict 141 | CONCEDE,,Concessions Made in Order to Secure Peace 142 | CONCEDE,,Government in {country_1} Makes Concessions in Face of Pressure 143 | CONCEDE,,{city}: Opposition Group Concedes After Meeting with Prime Minister 144 | CONCEDE,,{country_1} Makes Concessions to {country_2} on Immigration 145 | CONCEDE,,{country_1} Makes Concessions to the EU 146 | CONCEDE,,Rebel Group Makes Concessions to the {country_1} Government on Peace Talks 147 | CONCEDE,,{country_1} Makes Concessions to {country_2} on {country_3} Militants 148 | CONCEDE,,{country_1} Makes Concessions to {country_2} on Arms Embargo 149 | CONCEDE,,Political group agrees to halt protests in return for promises from {country_1} 150 | CONCEDE,,{country_1} agrees to remove legal restrictions on local ethnic group 151 | CONCEDE,,Activists agrees to suspend protests in return for rebels in {country_1} agreeing to a ceasefire 152 | CONCEDE,,Rebels in {country_1} agree to stop fighting in return for {country_1}'s future concessions 153 | CONCEDE,,{country_1} declares cease-fire with {country_2} 154 | CONCEDE,,"Administrative restrictions lifted on organization following talks with {country_1} officials""" 155 | CONCEDE,,Curfew lifted in {city} following agreement between mayor and protesters 156 | CONCEDE,,{country_1} Promises Concessions to Opposition 157 | CONCEDE,,{country_1} Makes Concessions to {country_2} in Nuclear Talks 158 | CONCEDE,,{country_1} eases administrative restrictions on civil society organizations 159 | CONCEDE,,{city} authorities agree to remove curfew in restive city 160 | CONCEDE,,{country_1} opposition suspends protests for two weeks 161 | CONCEDE,,Parties in {country_1} war declare ceasefire and agree to withdrawal 162 | CONCEDE,,{country_1} makes verbal commitment to ease economic restrictions 163 | CONCEDE,,Reformers gain important concessions in talks with {country_1} 164 | CONCEDE,,President of {country_1} Suspends Plans to Impose Curfews in Capital 165 | CONCEDE,,Israeli Prime Minister Benjamin Netanyahu agrees to halt settlement construction in the West Bank 166 | CONCEDE,,"Rebels in {country_1} announce suspension of attacks against government forces""" 167 | CONCEDE,,Rebels in {country_1} declare a nationwide ceasefire 168 | CONCEDE,,{city}: Opposition Group Concedes After Meeting with Prime Minister 169 | CONCEDE,,{country_1} Makes Concessions to {country_2} on Immigration 170 | CONCEDE,,{country_1} Makes Concessions to the EU 171 | CONCEDE,,Rebel Group Makes Concessions to the {country_1} Government on Peace Talks 172 | CONCEDE,,{country_1} Makes Concessions to {country_2} on {country_3} Militants 173 | CONCEDE,,{country_1} Makes Concessions to {country_2} on Arms Embargo 174 | CONCEDE,,Political group agrees to halt protests in return for promises from {country_1} 175 | CONCEDE,,{country_1} agrees to remove legal restrictions on local ethnic group 176 | CONCEDE,,Activists agrees to suspend protests in return for rebels in {country_1} agreeing to a ceasefire 177 | CONCEDE,,Rebels in {country_1} agree to stop fighting in return for {country_1}'s future concessions 178 | CONCEDE,,{country_1} declares cease-fire with 
{country_2} 179 | CONCEDE,,"Administrative restrictions lifted on organization following talks with {country_1} officials""" 180 | CONCEDE,,Curfew lifted in {city} following agreement between mayor and protesters 181 | CONCEDE,,{country_1} Promises Concessions to Opposition 182 | CONCEDE,,{country_1} Makes Concessions to {country_2} in Nuclear Talks 183 | CONCEDE,,{country_1} eases administrative restrictions on civil society organizations 184 | CONCEDE,,{city} authorities agree to remove curfew in restive city 185 | CONCEDE,,{country_1} opposition suspends protests for two weeks 186 | CONCEDE,,Parties in {country_1} war declare ceasefire and agree to withdrawal 187 | CONCEDE,,{country_1} makes verbal commitment to ease economic restrictions 188 | CONCEDE,,Reformers gain important concessions in talks with {country_1} 189 | CONCEDE,,President of {country_1} Suspends Plans to Impose Curfews in Capital 190 | CONCEDE,,Israeli Prime Minister Benjamin Netanyahu agrees to halt settlement construction in the West Bank 191 | CONCEDE,,"Rebels in {country_1} announce suspension of attacks against government forces""" 192 | CONCEDE,,Rebels in {country_1} declare a nationwide ceasefire 193 | CONSULT,,{country_1}'s Foreign Minister Embarks on Tour of Neighboring Countries 194 | CONSULT,,Negotiations Ongoing at International Climate Conference 195 | CONSULT,VISIT,{country_1} Leader Visits {country_2} to Deepen Ties 196 | CONSULT,VISIT,{country_1} President Visits {city} As Part of Multi-country Tour 197 | CONSULT,THIRD-PARTY,{country_1} Hosts Peace Talks Between {country_2} Government and Rebels in {city} 198 | CONSULT,THIRD-PARTY,Meetings Between {country_1} and {country_2} Representatives Begin in {city} as {country_1} Attempts to Broker Ceasefire 199 | CONSULT,MULTILATERAL,"Key Outcome Agreed to at Recent Global Summit over Climate Change, Environment" 200 | CONSULT,MULTILATERAL,Multilateral Talks Begin in {city} Over Next Phase of Trade Integration 201 | CONSULT,PHONE,{country_1} President Defends his Call to {country_2} Leader Over Dispute 202 | CONSULT,PHONE,"{country_1}, {country_2}, Military Leaders Speak by Phone Amid Heightened Tensions over {country_3}" 203 | COOPERATE,,{country_1} and {country_2} Sign Trade Deal in Renewed Push for Economic Cooperation 204 | COOPERATE,,"{country_1}, {country_2}, Hold Joint Military Drills in Southeast {country_1}" 205 | COOPERATE,,{country_1} and {country_2} Announce Common Currency 206 | COOPERATE,,{country_1} and {country_2} Expand Intelligence Sharing 207 | COOPERATE,,{country_1} and {country_2} Expand Judicial Cooperation 208 | COOPERATE,,{country_1} and {country_2} Begin Joint Military Exercises 209 | COOPERATE,,NGOs Gathered in {city} Establish New Working Group 210 | COOPERATE,,Rebels in {country_1}'s Civil War Begin Operating Under New Joint High Command 211 | COOPERATE,,Record-Breaking Trade Between {country_1} and {country_2} 212 | COOPERATE,,Troops From {country_1} and {country_2} Held Joint Military Maneuvers 213 | COOPERATE,,{country_1} Begins Importing Goods from {country_2} 214 | COOPERATE,,{city} Businesses Increase Investment in {country_1} 215 | COOPERATE,,Team From {city} Arrives in {country_1} To Begin Cooperation 216 | COOPERATE,,{country_1} and {country_2} Sign New Extradition Treaty 217 | COOPERATE,,{country_1} Started Exporting Goods to {country_2} On Wednesday 218 | COOPERATE,,Bilateral Trade Between {country_1} and {country_2} Growing Rapidly 219 | COOPERATE,,Leaders from {country_1} and {country_2} Broke Ground on New Joint 
Infrastructure Project 220 | COOPERATE,,{country_1} and {country_2} Ratify New Cooperation Agreement 221 | COOPERATE,,Manufacturers in {country_1} Create Joint Venture With Factories in {country_2} 222 | COOPERATE,,{country_1} Warship Conducts Joint Training Exercises with {country_2} Navy 223 | MOBILIZE,,Thousands of Soldiers Deployed to {country_1} Border to Address Unfolding Crisis 224 | MOBILIZE,,{country_1} Increases Military Readiness in Response to {country_2} 225 | MOBILIZE,TROOPS,{country_1} Army Calls Upon Conscripts in Preparation for Renewed Offensive 226 | MOBILIZE,TROOPS,Thousands of Troops Deployed to southern {country_1} Amid Simmering Tensions 227 | MOBILIZE,WEAPONS,Military Ramps up Production in Anticipation of Conflict with {country_1} 228 | MOBILIZE,WEAPONS,{country_1} Deploys Missile Defense System to Bolster Position 229 | MOBILIZE,POLICE,Hundreds of Police Called to Capital Following Recent Violence 230 | MOBILIZE,POLICE,Governor orders police to {city} center 231 | MOBILIZE,POLICE,Police Deployed Ahead of {city} Protests 232 | MOBILIZE,MILITIA,Militia Expands Recruitment in Northwest As Talks Fail 233 | MOBILIZE,MILITIA,"Pro-government militia begins operating near the {country_1} capital 234 | " 235 | MOBILIZE,MILITIA,Pro-government aaramilitary forces begin patrols in {country_1} 236 | MOBILIZE,MILITIA,Far-right paramilitary group formed in {country_1} 237 | MOBILIZE,MILITIA,{country_1} Militia Members Called into Action over Recent Tensions 238 | PROTEST,,Hundreds Rally in Capital over Recent Election Violence 239 | PROTEST,,Demonstrators Unite across {country_1} as Protests Spread 240 | PROTEST,DEMO,{country_1}: Peaceful Demonstration Target of Violent Crackdown 241 | PROTEST,DEMO,Dozens march in downtown demonstration 242 | PROTEST,DEMO,Peaceful protests begin across {city} 243 | PROTEST,DEMO,"Over 5,000 Unite for Peaceful Demonstration in Central {city}" 244 | PROTEST,RIOT,{city} Police Station Burned Down after Night of Rioting 245 | PROTEST,RIOT,Riots break out across {country_1} following ethnic violence 246 | PROTEST,RIOT,Shops smashed by violent protestors in {country_1} 247 | PROTEST,RIOT,Recent Ethnic Riots in {country_1} Cast Doubt on Transition to Democracy 248 | PROTEST,RIOT,Police respond with force to rioters in the capital 249 | PROTEST,STRIKE,Transportation Workers Join Nationwide Strike in {country_1} 250 | PROTEST,STRIKE,Over Half of {country_1} Teachers Now Out on Wildcat Strike 251 | PROTEST,STRIKE,Union negotiations collapse as labor strike begins 252 | PROTEST,STRIKE,Workers in {country_1} announce total work stoppage 253 | PROTEST,HUNGER,Jailed {country_1} Activist on Hunger Strike in {country_2} 254 | PROTEST,HUNGER,{country_1}: Imprisoned Regime Critic Announces Hunger Strike 255 | PROTEST,HUNGER,Activists continue hunger strike 256 | PROTEST,BOYCOTT,11 Countries Shun Upcoming Summit Due to Concerns over Human Rights 257 | PROTEST,BOYCOTT,"{country_1}, {country_2}, Announce Boycotts of Upcoming {city} Conference" 258 | PROTEST,BOYCOTT,Local citizens begin boycotting businesses 259 | PROTEST,OBSTRUCT,Protest Continues to Disrupt Traffic on Highway near {city} 260 | PROTEST,OBSTRUCT,Protestors Block {country_1} Border Near {city} 261 | PROTEST,OBSTRUCT,Protestors block highway as part of demonstration 262 | REJECT,,{country_1} Government and Rebels Reject Offer of International Mediation 263 | REJECT,,{country_1} Government Rejects Request to Delay Upcoming Elections 264 | REJECT,ASSIST,Widespread Suffering as {country_1} Government 
Refuses Medical Aid 265 | REJECT,ASSIST,{country_1} Government Declines Humanitarian Assistance to {country_2} As Conditions Worsen 266 | REJECT,CHANGE,{country_1} Voters Reject Referendum on Peace Process 267 | REJECT,CHANGE,{country_1} Rejects Peace Agreement Between {country_2} and Rebels 268 | REJECT,YIELD,{city} Government Declines to End Weekend Curfew Amid Criticism from Rights Groups 269 | REJECT,YIELD,Rebels in {city} Reject Calls for Ceasefire After Mediation Efforts 270 | REJECT,MEET,{country_1} Leader Said to Reject Meeting with {country_1} Head of State During {city} Summit 271 | REJECT,MEET,{country_1}: Chief of Armed Forces Declines Request for High-level Meeting with {country_2} Counterpart 272 | REQUEST,,{country_1}: Presidential Candidate Demands Formal Review of {country_2} Tariffs 273 | REQUEST,,"Protestors Demand Faster Action on Unemployment, Inflation" 274 | REQUEST,ASSIST,"{country_1} asks {country_2} for Military, Economic Aid Amidst Ongoing Crisis" 275 | REQUEST,ASSIST,{country_1} Leader Expresses Need for International Assistance During Lockdown 276 | REQUEST,CHANGE,Activists in {country_1} Request End to Controversial Migration Policy 277 | REQUEST,CHANGE,Foreign Business Demand End to Discriminatory Policies in {city} 278 | REQUEST,YIELD,Human Rights Group Asks for Release of Final Political Prisoner Held in {country_1} 279 | REQUEST,YIELD,{country_1} Asks {country_2} to End Sanctions on Energy Sector 280 | REQUEST,MEET,{country_1} Leader Reportedly Requests Meeting with {country_2}'s President 281 | REQUEST,MEET,Peacekeepers Urge Rebel Leader to Meet with {country_1} Military in Renewed Mediation Push 282 | RETREAT,,{country_1} and {country_2} Establish Repatriation Deal 283 | RETREAT,,"After 13 Years, All Sides Agree to End {country_1} Conflict" 284 | RETREAT,WITHDRAW,"{country_1} Military Withdrawal from {country_2} Leads to Chaos, Uncertainty" 285 | RETREAT,WITHDRAW,Military forces withdraw from {country_1} 286 | RETREAT,WITHDRAW,{country_1} will begin the process of withdrawing some of their military forces from {country_2} 287 | RETREAT,WITHDRAW,{country_1} Army pulls forces out of contested region 288 | RETREAT,WITHDRAW,Insurgents Announce Withdrawal from Northern {country_1} Villages 289 | RETREAT,RELEASE,{country_1} Releases Three Prominent Labor Rights Activists 290 | RETREAT,RELEASE,"After Three Years of Negotiation, {country_1}'s Rebel Group Releases Hostages" 291 | RETREAT,RELEASE,Dozens of captives released in {city} 292 | RETREAT,RETURN,Court Rules that Police Must Return Confiscated Vehicle Seized as Evidence During Raid 293 | RETREAT,RETURN,{country_1} Rebels Return Seized Property as Part of Peace Accord 294 | RETREAT,DISARM,{country_1}'s Remaining Rebel Factions Agree to Disarm in Ongoing Push for Peace 295 | RETREAT,DISARM,Militia lays down arms in disarmament agreement 296 | RETREAT,DISARM,"Following Transition, {country_1} Rebels Formally Disband" 297 | RETREAT,CEASEFIRE,"{country_1}, {country_2}, Sign Ceasefire over Border Dispute" 298 | RETREAT,CEASEFIRE,Ceasefire begins in war-torn region of {country_1} 299 | RETREAT,CEASEFIRE,Hostilities temporarily stop as talks begin in {city} 300 | RETREAT,CEASEFIRE,All sides in {country_1} Conflict Agree to Implement Ceasefire 301 | RETREAT,ACCESS,{country_1} Peacekeepers Arrive in {country_2} 302 | RETREAT,ACCESS,Election Observers Allowed Back into {country_1} After Reforms Implemented 303 | RETREAT,RESIGN,{country_1} Leader Quits Following Corruption Accusations 304 | RETREAT,RESIGN,Mayor of 
{city} Resigns Amidst Investigation into Wrongdoing 305 | SANCTION,,{country_1} Suspends Relations with the West in Ongoing Tensions 306 | SANCTION,,{country_1} Downgrades Diplomatic Ties with {country_2} Amid Ongoing Spat 307 | SANCTION,CONVICT,{country_1}: Warlord Guilty of Crimes Against Humanity 308 | SANCTION,CONVICT,Court hands down convictions in corruption case 309 | SANCTION,CONVICT,Four found guilty after lengthy trial 310 | SANCTION,CONVICT,Former leader of {country_1} sentenced to prison for serious crimes 311 | SANCTION,CONVICT,{country_1} General Convicted of War Crimes and Genocide at UN Tribunal 312 | SANCTION,EXPEL,{country_1} Expels Dozens of Diplomats Following Accusations 313 | SANCTION,EXPEL,{country_1} Leader Ejects International Observers in {city} 314 | SANCTION,WITHDRAW,Rebels Quit {country_1} Peace Talks 315 | SANCTION,WITHDRAW,Human Rights Group Withdraws from {country_1} Amid Increased Security Concerns 316 | SANCTION,DISCONTINUE,{country_1} Faces Shortages as Sanctions Take Hold 317 | SANCTION,DISCONTINUE,{country_1} Imposes Sanctions on Company Linked to {country_2} 318 | SUPPORT,,{country_1} Renews its Support of the World Health Organization 319 | SUPPORT,,{country_1} Citizens Express Support for Ongoing Protests in {city} 320 | SUPPORT,,{country_1} and {country_2} Establish Diplomatic Ties 321 | SUPPORT,,{country_1} and {country_2} Sign Trade Agreement 322 | SUPPORT,,{country_1} Expresses Support for {country_2}'s Policy 323 | SUPPORT,,{country_1} Commends {country_2} for Their Action 324 | SUPPORT,,{country_1} Ratifies Treaty Signed by {country_2} 325 | SUPPORT,,{country_1} and {country_2} Resume Diplomatic Relations 326 | SUPPORT,,{country_1} and {country_2} Improve Diplomatic Cooperation 327 | SUPPORT,,{country_1} and {country_2} Expand Diplomatic Ties 328 | SUPPORT,,{country_1} Approves of {country_2}'s Policy 329 | SUPPORT,,{country_1} and {country_2} Cooperate on Climate Change 330 | SUPPORT,,{country_1} and {country_2} Express Mutual Respect for Each Other 331 | SUPPORT,,{country_1} commends {country_2} for taking steps to improve human rights 332 | SUPPORT,,{country_1} and {country_2} ratify agreement on cultural exchange 333 | SUPPORT,,{country_1} thanks {country_2} for support on political issues 334 | SUPPORT,,{country_1} commends {country_2}'s handling of recent crisis 335 | SUPPORT,,{country_1} and {country_2} commit to working together to resolve regional conflict 336 | SUPPORT,,{country_1} and {country_2} sign landmark peace agreement 337 | SUPPORT,,{country_1} and {country_2} pledge to increase cooperation 338 | SUPPORT,,{country_1} and {country_2} Establish Diplomatic Ties 339 | SUPPORT,,{country_1} and {country_2} Sign Trade Agreement 340 | SUPPORT,,{country_1} Expresses Support for {country_2}'s Policy 341 | SUPPORT,,{country_1} Commends {country_2} for Their Action 342 | SUPPORT,,{country_1} Ratifies Treaty Signed by {country_2} 343 | SUPPORT,,{country_1} and {country_2} Resume Diplomatic Relations 344 | SUPPORT,,{country_1} and {country_2} Improve Diplomatic Cooperation 345 | SUPPORT,,{country_1} and {country_2} Expand Diplomatic Ties 346 | SUPPORT,,{country_1} Approves of {country_2}'s Policy 347 | SUPPORT,,{country_1} and {country_2} Cooperate on Climate Change 348 | SUPPORT,,{country_1} and {country_2} Express Mutual Respect for Each Other 349 | SUPPORT,,{country_1} commends {country_2} for taking steps to improve human rights 350 | SUPPORT,,{country_1} and {country_2} ratify agreement on cultural exchange 351 | SUPPORT,,{country_1} 
thanks {country_2} for support on political issues
352 | SUPPORT,,{country_1} commends {country_2}'s handling of recent crisis
353 | SUPPORT,,{country_1} and {country_2} commit to working together to resolve regional conflict
354 | SUPPORT,,{country_1} and {country_2} sign landmark peace agreement
355 | SUPPORT,,{country_1} and {country_2} pledge to increase cooperation
356 | THREATEN,,Activists Raise Potential of Protests if Election Fraud Ignored
357 | THREATEN,,Union Workers Threaten Strike if Deal is Not Reached by Midnight
358 | THREATEN,RESTRICT,Government Floats Possibility of Major Restrictions on Movement over Protests
359 | THREATEN,RESTRICT,{country_1} Militants Threaten to Restrict Shipping Lanes
360 | THREATEN,BAN,{country_1} Suggests it May Ban Former General from Running in Election
361 | THREATEN,BAN,Officials in {country_1} threaten to ban opposition political parties
362 | THREATEN,BAN,{country_1} threatens to ban civil society organizations
363 | THREATEN,BAN,{country_1} Leader Raises Possibility of Bans on Opposition Parties
364 | THREATEN,ARREST,{country_1} Government to Detain Journalists if they Publicize Fraud Allegations
365 | THREATEN,ARREST,Police threaten to make arrests as unrest continues in {country_1}
366 | THREATEN,ARREST,Officials promise arrests if corruption continues
367 | THREATEN,ARREST,{country_1} Police Threaten Protestors with Arrest
368 | THREATEN,RELATIONS,Leader Says {country_1} Could Suspend Relations with Neighbor over Deal
369 | THREATEN,RELATIONS,Negotiators from {country_1} threaten to walk away from ongoing talks
370 | THREATEN,RELATIONS,{country_1} Raises Potential of Suspending Talks with {country_2} over Tensions
371 | THREATEN,EXPEL,{country_1} Threatens to Expel International Observers
372 | THREATEN,EXPEL,{country_1} Warns {country_2} of Retaliation after Diplomats Expelled
373 | THREATEN,TERRITORY,{country_1} Threatens to Annex Territory if Conditions Are Not Met
374 | THREATEN,TERRITORY,Why is {country_1} Threatening to Invade {country_2}?
375 | THREATEN,TERRITORY,{country_1} warns {country_2} that it plans to invade
376 | THREATEN,VIOLENCE,Insurgents Promise to Renew Bombing Campaign in South {country_1} if Talks Fail
377 | THREATEN,VIOLENCE,{country_1} Leader Threatens to End Protests with Military Force
-------------------------------------------------------------------------------- /setup/wiki/README.md: --------------------------------------------------------------------------------
1 | ## Wikipedia in Elasticsearch
2 | 
3 | The files in this repo will help you set up an offline Wikipedia in Elasticsearch for easy querying.
4 | 
5 | ## Quickstart
6 | 
7 | To get started fast, you can download a pre-built Elasticsearch volume to use in Docker. See the main NGEC/README.md page for quick-start instructions.
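If you go the pre-built route, the volume just needs to be mounted into a standard Elasticsearch container. A minimal sketch, assuming the volume has been extracted to `./wiki_index/`; this mirrors the `docker run` command in `create_index.sh` below:

```bash
docker run -d -p 127.0.0.1:9200:9200 -e "discovery.type=single-node" \
    -v $PWD/wiki_index/:/usr/share/elasticsearch/data elasticsearch:7.10.1
```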
8 | 
9 | ## Building a Wikipedia index
10 | 
11 | You might have a few reasons for building your own offline Wikipedia index:
12 | 
13 | - Using a non-English Wikipedia
14 | - Creating an updated Wikipedia if the pre-built one is stale
15 | - Customizing the format or structure of the index (if so, you'll need to modify the code)
16 | 
17 | ## Index format
18 | 
19 | After ingesting, each Wikipedia article will be stored in the following form:
20 | 
21 | - `title`: the title of the Wikipedia page (no underscores)
22 | - `redirects`: every page that redirects to this page
23 | - `alternative_names`: alternative names for the article, identified from bold phrases in the first sentence
24 | - `short_desc`: Wikipedia's "short description" of the article
25 | - `categories`: the Wikipedia categories associated with this page
26 | - `intro_para`: the cleaned text of the first paragraph of the article. All text after the intro paragraph is discarded for space reasons.
27 | - `infobox`: if the article includes a side infobox, it will be stored here.
28 | - `box_type`: articles can have different box formats, e.g. "legislature", "military unit", "settlement"
29 | - `affiliated_people`: the contents of the 'leaders', 'founded_by', or 'founder' fields if present in the infobox. (This field is currently unused downstream.)
30 | 
31 | ## Setup
32 | 
33 | First, make sure that Redis is installed and running:
34 | 
35 | ```
36 | sudo apt-get install redis-server
37 | ```
38 | 
39 | Alternatively, you can use a Docker container:
40 | 
41 | ```bash
42 | docker run -d -p 6379:6379 --name redis redis
43 | ```
44 | 
45 | Then, install the Python requirements:
46 | 
47 | ```bash
48 | pip install -r requirements.txt
49 | ```
50 | 
51 | 
52 | ## Running
53 | 
54 | 
55 | **NOTE**: You can skip the first step (building the Wiki redirect file) by downloading a prebuilt pickle of the redirect dictionary from Google Drive. Building the redirect file is by far the slowest step (it can take up to 24 hours on a slow machine), so skipping it saves substantial time. Be aware, though, that unpickling a file downloaded from the internet carries security risks. If you're comfortable running an untrusted pickle file, download it from [here](https://drive.google.com/file/d/1zJviHKAm0bQH9xaq5p-dUrVnknDlFgJK/view?usp=sharing) and place it in the `setup` directory.
56 | 
57 | To run the entire process, run the following command:
58 | 
59 | ```bash
60 | bash create_index.sh
61 | ```
62 | 
63 | This will run all the commands needed to:
64 | 
65 | - set up Elasticsearch
66 | - download the English Wikipedia dump
67 | - go through the Wikipedia dump and identify all redirects (see note above--this is slow)
68 | - store those redirects in Redis for easy querying
69 | - go through Wikipedia again,
70 |     - parsing each article
71 |     - looking up alternative names in Redis
72 |     - loading the formatted article into Elasticsearch
73 | 
74 | Alternatively, you can run each command in the bash file separately in a terminal, so you don't have to re-run everything if you encounter an error somewhere.
75 | 
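Once the load completes, you can sanity-check the index by pulling back a few of the fields described under "Index format" above. A minimal sketch; the article title here is only an illustrative example:

```bash
curl -s 'localhost:9200/wiki/_search' -H 'Content-Type: application/json' -d '
{
  "query": {"match": {"title": "Anazarbus"}},
  "_source": ["title", "short_desc", "redirects", "alternative_names"]
}'
```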
76 | ## Updating the index
77 | 
78 | To update the index with a new copy of Wikipedia, you should be able to do the following:
79 | 
80 | 1. Delete the existing Wikipedia index (e.g., `curl -X DELETE 'localhost:9200/wiki'`), but don't destroy the entire Elasticsearch container. (I did this once and realized I'd also nuked the Geonames index, which thankfully only takes about 30 minutes to rebuild.)
81 | 2. Delete the old Wikipedia dump and re-download it: `wget "https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2"`
82 | 3. Re-do the Elasticsearch mapping with the new wiki_mapping.json file, which separates out alternative names and redirects (and changes the way the indexing is done): `curl -XPUT 'localhost:9200/wiki' -H 'Content-Type: application/json' -d @wiki_mapping.json`
83 | 4. SKIP the "build links" step: it's already covered by the redirect file you built or downloaded earlier, and it is absurdly slow.
84 | 5. Load the redirects file into Redis: `python load_wiki_es.py load_redis`
85 | 6. Load Wikipedia into Elasticsearch: `python load_wiki_es.py load_es`
86 | 
87 | 
88 | **A few caveats**:
89 | 
90 | - The `create_index.sh` script assumes you'll run Elasticsearch in a Docker container. If you'd rather run it directly, remove the Docker step and make sure Elasticsearch is listening on the port the script expects.
91 | - The Python `requirements.txt` file pins specific package versions. To avoid overriding existing package versions, consider installing into a virtual environment.
92 | 
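If you'd rather query the finished index from Python, the sketch below uses the same `elasticsearch_dsl` `Search.query(...)` pattern that `actor_contrastive_data.py` in this directory uses; the query string is only an example:

```python
from elasticsearch_dsl import Search, connections

# connect to the local Elasticsearch instance and search the 'wiki' index
es = connections.create_connection(hosts=["http://localhost:9200"])
s = Search(using=es, index="wiki")

# look an article up across the title, redirect, and alternative-name fields
q = {"multi_match": {"query": "Justinian I",
                     "fields": ["title", "redirects", "alternative_names"]}}
for hit in s.query(q)[0:5].execute():
    print(hit.title, "-", hit.short_desc)
```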
-------------------------------------------------------------------------------- /setup/wiki/actor_contrastive_data.py: --------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import pickle
4 | from tqdm import tqdm
5 | import pylcs
6 | import random
7 | from itertools import combinations
8 | import itertools
9 | import jsonlines
10 | import numpy as np
11 | 
12 | files = os.listdir()
13 | versions = [int(re.findall("dict_(\d+)\.", i)[0]) for i in files if re.match("redirect_dict", i)]
14 | 
15 | with open(f"redirect_dict_{max(versions)}.0.pkl", "rb") as f:
16 |     redirect_dict = pickle.load(f)
17 | 
18 | query = "Donald Trump"
19 | 
20 | keys = list(redirect_dict.keys())
21 | sims = [pylcs.lcs_sequence_length(query, i) if i else 0 for i in tqdm(keys)]  # keep indices aligned with `keys`
22 | top_idx = np.argpartition(sims, -4)[-4:]  # indices of the 4 closest keys
23 | ## Build training data
24 | 
25 | 
26 | redirect_list = list(redirect_dict.items())
27 | 
28 | item = redirect_list[33333]  # (scratch) inspect a single entry
29 | 
30 | skip_patt = re.compile(r"#|/")
31 | disambig_patt = re.compile(r"\(disambiguation\)")
32 | 
33 | def clean_entries(entries):
34 |     entries = [i for i in entries if not re.search(skip_patt, i)]
35 |     entries = [re.sub(disambig_patt, "", i).strip() for i in entries]
36 |     entries = list(set(entries))
37 |     return entries
38 | 
39 | pos_data = []
40 | pos_file = "redirect_sim_pos.jsonl"
41 | for n, item in tqdm(enumerate(redirect_list), total=len(redirect_list)):
42 |     entries = [item[0]] + item[1]
43 |     entries = clean_entries(entries)
44 |     if len(entries) > 50:
45 |         entries = random.choices(entries, k=50)
46 |     res = list(combinations(entries, 2))
47 |     random.shuffle(res)
48 |     pos_data.extend(res[:10])
49 |     if n % 5000 == 0:
50 |         #print(n)
51 |         if pos_data:
52 |             with jsonlines.open(pos_file, "a") as f:
53 |                 f.write_all(pos_data)
54 |             pos_data = []
55 | if pos_data:
56 |     with jsonlines.open(pos_file, "a") as f:
57 |         f.write_all(pos_data)
58 | 
59 | 
60 | # 140156 * (4328876 / 18106)  # (scratch) rough extrapolation from a sample to the full redirect list
61 | 
62 | 
63 | neg_data = []
64 | neg_file = "redirect_sim_neg.jsonl"
65 | for n, item in tqdm(enumerate(redirect_list), total=len(redirect_list)):
66 |     entries = [item[0]] + item[1]
67 |     entries = clean_entries(entries)
68 |     for entry in entries:
69 |         other_item = random.choice(redirect_list)
70 |         # avoid the vanishingly rare chance of sampling the same entry
71 |         if other_item[0] == item[0]:
72 |             other_item = random.choice(redirect_list)
73 |         other_entries = [other_item[0]] + other_item[1]
74 |         other_entries = clean_entries(other_entries)
75 |         if other_entries:
76 |             neg_sample = random.choice(other_entries)
77 |             neg_data.append((entry, neg_sample))
78 |     if n % 5000 == 0:
79 |         #print(n)
80 |         if neg_data:
81 |             with jsonlines.open(neg_file, "a") as f:
82 |                 f.write_all(neg_data)
83 |             neg_data = []
84 | if neg_data:
85 |     with jsonlines.open(neg_file, "a") as f:
86 |         f.write_all(neg_data)
87 | 
88 | ## Wiki version
89 | # NOTE: `setup_es` was never defined in this file; the one-line stand-in below is an assumption (elasticsearch_dsl pointed at a local ES instance with the 'wiki' index):
90 | from elasticsearch_dsl import Search, connections
91 | def setup_es(): return Search(using=connections.create_connection(hosts=["http://localhost:9200"]), index="wiki")
92 | conn = setup_es()
93 | box_types = ['officeholder', 'settlement', 'official post', 'company',
94 |              'war faction', 'government agency', 'military unit', 'person',
95 |              'aircraft begin', 'ship begin', 'weapon', 'military person',
96 |              'politician', 'Minister', 'criminal', 'company']
97 | 
98 | # 'honorific-prefix'   (stray string literal, commented out; prefixes are handled in page_to_entries below)
99 | 
100 | box_type = box_types[0]
101 | q = {"multi_match": {"query": box_type,
102 |                      "fields": ['box_type'],
103 |                      "type" : "phrase"}
104 |     }
105 | res = conn.query(q)[0:40].execute()
106 | results = [i.to_dict()['_source'] for i in res['hits']['hits']]
107 | 
108 | # (scratch) the dangling `for page in results:` loop had no body; inspect one hit instead:
109 | page = results[4]
110 | 
111 | 
112 | 
113 | def page_to_entries(page):
114 |     entries = [page['title']] + page['redirects'] + page['alternative_names']
115 |     if 'infobox' in page.keys():
116 |         if 'name' in page['infobox'].keys():
117 |             entries.append(page['infobox']['name'])
118 |         if 'honorific-suffix' in page['infobox'].keys():
119 |             nn = page['infobox']['name'] + " " + page['infobox']['honorific-suffix']
120 |             entries.append(nn)
121 |         if 'honorific-prefix' in page['infobox'].keys():
122 |             nn = page['infobox']['honorific-prefix'] + " " + page['infobox']['name']
123 |             entries.append(nn)
124 |         if 'office' in page['infobox'].keys():
125 |             nn = page['infobox']['office'] + " " + page['infobox']['name']
126 |             entries.append(nn)
127 |         if 'office1' in page['infobox'].keys():
128 |             nn = page['infobox']['office1'] + " " + page['infobox']['name']
129 |             entries.append(nn)
130 |         if 'rank' in page['infobox'].keys():
131 |             nn = page['infobox']['rank'] + " " + page['infobox']['name']
132 |             entries.append(nn)
133 |     entries = clean_entries(entries)
134 |     return entries
135 | 
136 | 
137 | def make_pos_combos(entries, max_pairs=30):
138 |     # First, limit to 50 redirects
139 |     if len(entries) > 50:
140 |         entries = random.choices(entries, k=50)
141 |     res = list(combinations(entries, 2))
142 |     random.shuffle(res)
143 |     return res[0:max_pairs]
144 | 
145 | def get_close_match(query, max_results=3, conn=conn):
146 |     q = {"multi_match": {"query": query,
147 |                          "fields": ['title', 'redirects'],
148 |                          }}
149 |     res = conn.query(q)[0:max_results].execute()
150 |     results = [i.to_dict()['_source'] for i in res['hits']['hits']]
151 |     results = [i for i in results if i['title'] != query]
152 |     return results
153 | 
154 | def get_neg_pairs(entries, page):
155 |     other_names = []
156 |     close_matches = get_close_match(page['title'], 5)
157 |     for cm in close_matches:
158 |         nes = page_to_entries(cm)
159 |         other_names.extend(nes)
160 | 
161 |     samp_size = min(len(entries), 2)
162 |     if samp_size == 0:
163 |         return []
164 |     neg_pairs = []
165 |     for n in other_names:
166 |         es = random.sample(entries, samp_size)
167 |         for e in es:
168 |             neg_pairs.append((e, n))
169 |     return neg_pairs
170 | 
171 | all_pos = []
172 | all_neg = []
173 | neg_file = "redirect_sim_neg2.jsonl"
174 | pos_file = "redirect_sim_pos2.jsonl"
175 | 
176 | box_types = ['officeholder', 'settlement', 'official post', 'company',
177 |              'war faction', 'government agency', 'military unit', 'person',
178 |              'aircraft begin', 'ship begin', 'weapon', 'military person',
179 |              'politician', 'Minister', 'criminal', 'company', 'infobox company',
180 |              'country', 'geopolitical organization']
181 | 
182 | 
183 | for box_type in box_types:  # (was `box_types[]`, a syntax error)
184 |     q = {"multi_match": {"query": box_type,
185 |                          "fields": ['box_type'],
186 |                          "type" : "phrase"}
187 |         }
188 |     res = conn.query(q)[0:10000]
189 | 
190 |     for i in tqdm(res):
191 |         page = i.to_dict()
192 |         entries = page_to_entries(page)
193 |         pos = make_pos_combos(entries)
194 |         neg = get_neg_pairs(entries, page)
195 | 
196 |         all_pos.extend(pos)
197 |         all_neg.extend(neg)
198 | 
199 |         if len(all_pos) > 5000:
200 |             with jsonlines.open(neg_file, "a") as f:
201 |                 f.write_all(all_neg)
202 |             with jsonlines.open(pos_file, "a") as f:
203 |                 f.write_all(all_pos)
204 |             all_pos = []
205 |             all_neg = []
206 | 
207 | #para_terms = ['officer', 'politician', 'diplomat', 'country', 'province', 'municipalities', 'city', 'municipality', 'non-governmental organization']
208 | para_terms = ["Arab", "Andorra","United Arab Emirates","Afghanistan","Antigua and Barbuda","Anguilla","Albania","Armenia","Angola","Argentina","American Samoa","Austria","Australia","Aruba","Azerbaijan","Bosnia and Herzegovina","Barbados","Bangladesh","Belgium","Burkina Faso","Bulgaria","Bahrain","Burundi","Benin","Saint Barthélemy","Bermuda","Brunei","Bolivia","Brazil","Bahamas","Bhutan","Botswana","Belarus","Belize","Canada","Cocos [Keeling] Islands","Democratic Republic of the Congo","Central African Republic","Congo","Switzerland","Côte d’Ivoire","Cook Islands","Chile","Cameroon","China","Colombia","Costa Rica","Cuba","Cape Verde","Curaçao","Christmas Island","Cyprus","Czech Republic","Germany","Djibouti","Denmark","Dominican Republic","Algeria","Ecuador","Estonia","Egypt","Western Sahara","Eritrea","Spain","Ethiopia","European Union","Finland","Fiji","Falkland Islands","Micronesia","Faroe Islands","France","Gabon","United Kingdom","Grenada","Georgia","Ghana","Gibraltar","Greenland","Gambia","Guinea","Guadeloupe","Equatorial Guinea","Greece","Guatemala","Guam","Guinea-Bissau","Guyana","Hong Kong SAR China","Heard Island and McDonald Islands","Honduras","Croatia","Haiti","Hungary","Indonesia","Ireland","Israel","Isle of Man","India","Iraq","Iran","Iceland","Italy","Jersey","Jamaica","Jordan","Japan","Kenya","Kosovo","Kyrgyzstan","Cambodia","Kiribati","Comoros","Saint Kitts and Nevis","North Korea","South Korea","Kuwait","Cayman Islands","Kazakhstan","Laos","Lebanon","Saint Lucia","Liechtenstein","Sri Lanka","Liberia","Lesotho","Lithuania","Luxembourg","Latvia","Libya","Morocco","Monaco","Moldova","Montenegro","Saint Martin","Madagascar","Marshall Islands","Macedonia","Mali","Myanmar [Burma]","Mongolia","Mauritania","Montserrat","Malta","Mauritius","Maldives","Malawi","Mexico","Malaysia","Mozambique","Namibia","New Caledonia","Niger","Norfolk Island","Nigeria","Nicaragua","Netherlands","Norway","Nepal","Nauru","Niue","New Zealand","Oman","Panama","Peru","French Polynesia","Papua New Guinea","Philippines","Pakistan","Poland","Pitcairn Islands","Puerto Rico","Palestinian Territories","Portugal","Palau","Paraguay","Qatar","Romania","Serbia","Russia","Rwanda","Saudi Arabia","Solomon Islands","Seychelles","Sudan","Sweden","Singapore","Saint Helena","Slovenia","Slovakia","Sierra Leone","San Marino","Senegal","Somalia","Suriname","São Tomé and Príncipe","El Salvador","Syria","Swaziland","Turks and Caicos Islands","Chad","Togo","Thailand","Tajikistan","Tokelau","Timor-Leste","Turkmenistan","Tunisia","Tonga","Turkey","Trinidad and Tobago","Tuvalu","Taiwan","Tanzania","Ukraine","Uganda","United Nations","United States","Uruguay","Uzbekistan","Vatican City",
City","Saint Vincent and the Grenadines","Venezuela","British Virgin Islands","U.S. Virgin Islands","Vietnam","Vanuatu","Wallis and Futuna","Samoa","Yemen","South Africa","Zambia","Zimbabwe"] 209 | for para_term in tqdm(para_terms): 210 | q = {"multi_match": {"query": para_term, 211 | "fields": ['intro_para'] 212 | }} 213 | res = conn.query(q)[0:10000] 214 | for i in tqdm(res, leave=False, total=10000): 215 | page = i.to_dict() 216 | entries = page_to_entries(page) 217 | pos = make_pos_combos(entries) 218 | neg = get_neg_pairs(entries, page) 219 | 220 | all_pos.extend(pos) 221 | all_neg.extend(neg) 222 | 223 | if len(all_pos) > 5000: 224 | with jsonlines.open(neg_file, "a") as f: 225 | f.write_all(all_neg) 226 | with jsonlines.open(pos_file, "a") as f: 227 | f.write_all(all_pos) 228 | all_pos = [] 229 | all_neg = [] 230 | 231 | with jsonlines.open(neg_file, "a") as f: 232 | f.write_all(all_neg) 233 | with jsonlines.open(pos_file, "a") as f: 234 | f.write_all(all_pos) 235 | 236 | q = {"multi_match": {"query": "officer", 237 | "fields": ['intro_para'], 238 | }} -------------------------------------------------------------------------------- /setup/wiki/create_index.sh: -------------------------------------------------------------------------------- 1 | echo "Installing Redis..." 2 | sudo apt-get install redis-server 3 | 4 | echo "Starting Docker container and data volume..." 5 | # create the directory first to avoid permission issues when Docker is running as root 6 | mkdir $PWD/wiki_index/ 7 | docker run -d -p 127.0.0.1:9200:9200 -e "discovery.type=single-node" -v $PWD/wiki_index/:/usr/share/elasticsearch/data elasticsearch:7.10.1 8 | 9 | echo "Downloading Wikipedia..." 10 | wget "https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2" 11 | 12 | echo "Creating mappings for the fields in the Wikipedia index..." 13 | curl -XPUT 'localhost:9200/wiki' -H 'Content-Type: application/json' -d @wiki_mapping.json 14 | 15 | echo "Change disk availability limits..." 16 | curl -X PUT "localhost:9200/_cluster/settings" -H 'Content-Type: application/json' -d' 17 | { 18 | "transient": { 19 | "cluster.routing.allocation.disk.watermark.low": "10gb", 20 | "cluster.routing.allocation.disk.watermark.high": "5gb", 21 | "cluster.routing.allocation.disk.watermark.flood_stage": "4gb", 22 | "cluster.info.update.interval": "1m" 23 | } 24 | } 25 | ' 26 | 27 | echo "\nBuilding redirect links..." 28 | python load_wiki_es.py build_links 29 | 30 | echo "\nBuilding links..." 31 | python load_wiki_es.py load_redis 32 | 33 | echo "\nLoading Wikipedia into Elasticsearch..." 
34 | python load_wiki_es.py load_es
35 | 
36 | echo "Done"
37 | 
-------------------------------------------------------------------------------- /setup/wiki/load_wiki_es.py: --------------------------------------------------------------------------------
1 | import multiprocessing
2 | import elasticsearch
3 | from elasticsearch import Elasticsearch, helpers
4 | import mwxml
5 | import mwparserfromhell
6 | import re
7 | from tqdm import tqdm
8 | from textacy.preprocessing.remove import accents as remove_accents
9 | from bz2 import BZ2File as bzopen
10 | import pickle
11 | import plac
12 | import os
13 | import redis
14 | import json
15 | import datetime
16 | 
17 | import logging
18 | 
19 | logger = logging.getLogger()
20 | handler = logging.FileHandler("wiki_es.log")
21 | formatter = logging.Formatter(
22 |     '%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
23 | handler.setFormatter(formatter)
24 | logger.addHandler(handler)
25 | logger.setLevel(logging.DEBUG)
26 | 
27 | # quiet the noisy Elasticsearch client logs (was: `es_logger = elasticsearch.logger`)
28 | logging.getLogger("elasticsearch").setLevel(logging.WARNING)
29 | # NOTE: `get_redirect` is an early, unused draft; the pipeline uses get_page_redirect below.
30 | def get_redirect(page, title=None, text=None):
31 |     if page and not title and not text:
32 |         text = next(page).text
33 |         title = page.title
34 |     if not page:
35 |         logger.debug("not page")
36 |         return None
37 |     if not title:
38 |         return None
39 |     if not text:
40 |         return None
41 | 
42 |     wikicode = mwparserfromhell.parse(str(text))
43 | 
44 |     raw_intro = wikicode.get_sections()[0]
45 |     intro_para = raw_intro.strip_code()
46 |     if re.match("#?(REDIRECT|redirect)", intro_para):
47 |         # skip/ignore redirects for now
48 |         return None
49 | 
50 | 
51 | redirect_pattern = re.compile("#?(REDIRECT|redirect|Redirect)")
52 | 
53 | def get_page_redirect(page, title, text):
54 |     """Returns (original page, new page to redirect to)"""
55 |     wikicode = mwparserfromhell.parse(text)
56 |     raw_intro = wikicode.get_sections()[0]
57 |     if re.match(redirect_pattern, str(raw_intro)):
58 |         new_page = re.findall(r"\[\[(.+?)\]\]", str(raw_intro))
59 |         try:
60 |             new_page = new_page[0]
61 |         except:
62 |             return None
63 |         # Too many false positives come from this splitting. Keep as-is instead, even
64 |         # if that means it won't get added to any articles.
65 |         #new_page = new_page.split("#")[0]
66 |         return (str(title), str(new_page))
67 | 
68 | 
69 | def clean_names(name_list):
70 |     if not name_list:
71 |         return []
72 |     name_list = [re.sub("\|.+?\]\]", "", i).strip() for i in name_list]
73 |     name_list = [re.sub("\[|\]", "", i).strip() for i in name_list]
74 |     # There are some weird entries here like "son:"
75 |     name_list = [i for i in name_list if not i.endswith(":")]
76 |     de_accent = [remove_accents(i) for i in name_list]
77 |     name_list = name_list + de_accent
78 |     name_list = list(set(name_list))
79 |     return name_list
80 | 
81 | 
82 | def parse_wiki_article(page, title=None, text=None, use_redis=True):
83 |     """
84 |     Go through a Wikipedia dump and format the article so it's useful for us.
85 | 
86 |     Pull out the article's:
87 |     - title
88 |     - short desc: (new!) it's similar to the Wikidata short description
89 |     - first para
90 |     - redirects (from Redis)
91 |     - alternative names (anything bold in the first para)
92 |     - info box
93 |     """
94 |     # These had errors earlier: pull them out separately for inspection.
95 |     if title in ['Kyle Rittenhouse', 'Dmitry Peskov', 'Warsaw', 'Brasília', 'Beirut', 'Muhammadu Buhari',
96 |                  'Anil Deshmukh', 'Viktor Orbán']:
97 |         print(f"found article: {title}")
98 |         with open(f"error_articles/list/{title}.txt", "w") as f:
99 |             f.write(text)
100 |     if not title and not text:
101 |         if not page:
102 |             logger.debug(f"not page: {title}")
103 |             return None
104 |         text = next(page).text
105 |         title = page.title
106 |     if not text:
107 |         logger.debug(f"No text for {title}")
108 |         return None
109 | 
110 |     # There are a whole bunch of article types that we want to skip
111 |     if title.endswith(".jpg") or title.endswith(".png"):
112 |         logger.debug(f"Skipping image: {title}")
113 |         return None
114 |     if re.search("\-stub", title):
115 |         logger.debug(f"Skipping Stub: {title}")
116 |         return None
117 |     if re.match("(User|Selected anniversaries)", title):
118 |         logger.debug(f"Skipping User: {title}")
119 |         return None
120 |     if re.search("\([Dd]isambiguation\)", title):
121 |         logger.debug(f"Skipping Disambig: {title}")
122 |         return None
123 |     if re.search("Articles for deletion", title):
124 |         logger.debug(f"Skipping For deletion: {title}")
125 |         return None
126 |     if re.match("List ", title):
127 |         logger.debug(f"Skipping List: {title}")
128 |         return None
129 |     if re.match("Portal ", title):
130 |         logger.debug(f"Skipping Portal: {title}")
131 |         return None
132 |     if re.search("Today's featured article", title):
133 |         logger.debug(f"Skipping featured article: {title}")
134 |         return None
135 |     if re.search("Featured article candidates", title):
136 |         logger.debug(f"Skipping featured article candidate: {title}")
137 |         return None
138 |     if title.startswith("Peer review/"):
139 |         logger.debug(f"Skipping peer review article: {title}")
140 |         return None
141 |     if title.startswith("Requests for adminship/"):
142 |         logger.debug(f"Skipping adminship: {title}")
143 |         return None
144 |     if title.startswith("Featured list candidates/"):
145 |         logger.debug(f"Skipping list candidates: {title}")
146 |         return None
147 |     if title.startswith("Sockpuppet investigations/"):
148 |         logger.debug(f"Skipping sockpuppet: {title}")
149 |         return None
150 |     # clean up intro para? [[File:Luhansk raions eng.svg|thumb|100px|Raions of Luhansk]]
151 |     # also delete the leftover alt names parentheses?
152 | # "[[File:Luhansk raions eng.svg|thumb|100px|Raions of Luhansk]]\nLuhansk,(, ; , , , ; , ), also known as Lugansk and formerly known as Voroshilovgrad (1935-1958)" 153 | 154 | wikicode = mwparserfromhell.parse(str(text)) 155 | 156 | raw_intro = wikicode.get_sections()[0] 157 | intro_para_raw = raw_intro.strip_code() 158 | # strip out the occasional stuff that slips through 159 | intro_para = re.sub("(\[\[.+?\]\])", "", intro_para_raw).strip() 160 | # delete thumbs (not removed by strip_code()): 161 | intro_para = re.sub("^thumb\|.+?\n", "", intro_para) 162 | # do it again, the lazy way 163 | intro_para = re.sub("^thumb\|.+?\n", "", intro_para) 164 | # delete the first set of paratheses 165 | intro_para = re.sub("\(.+?\)", "", intro_para, 1) 166 | if not intro_para: 167 | logger.debug(f"No intro para for {title}.") 168 | #logger.debug(f"{wikicode.get_sections()[:2]}") 169 | return None 170 | if re.match("#?(REDIRECT|redirect|Redirect)", intro_para): 171 | logger.debug(f"Detected redirect in first para: {title}") 172 | # skip/ignore redirects for now 173 | return None 174 | if re.search("\*?\n?Category\:", intro_para): 175 | logger.debug(f"Category: {title}") 176 | return None 177 | if intro_para.startswith("Category:"): 178 | logger.debug(f"Category: {title}") 179 | return None 180 | if intro_para.startswith(""): 181 | logger.debug(f"Sneaky category? {title}") 182 | return None 183 | if re.search("may refer to", intro_para[0:100]): 184 | logger.debug(f"may refer to: {title}") 185 | return None 186 | if re.search("most often refers", intro_para[0:100]): 187 | logger.debug(f"most often refers: {title}") 188 | return None 189 | if re.search("most commonly refers", intro_para[0:100]): 190 | logger.debug(f"most commonly refers: {title}") 191 | return None 192 | if re.search("[Pp]ortal\:", intro_para[0:100]): 193 | logger.debug(f"Portal: {title}") 194 | return None 195 | alternative_names = re.findall("'''(.+?)'''", str(raw_intro)) 196 | 197 | redirects = [] 198 | if use_redis: 199 | redis_db = redis.StrictRedis(host="localhost", port=6379, db=0, charset="utf-8", decode_responses=True) 200 | redirects = redis_db.get(title) 201 | if redirects: 202 | redirects = redirects.split(";") 203 | 204 | if re.match("Categories for", title): 205 | return None 206 | 207 | try: 208 | short_desc = re.findall("\{\{[Ss]hort description\|(.+?)\}\}", str(raw_intro))[0].strip() 209 | except: 210 | logger.debug(f"Error getting short desc for {title}") 211 | #title_mod = re.sub("/", "_", title) 212 | #with open(f"error_articles/short_desc/{title_mod}.txt", "w") as f: 213 | # f.write(str(raw_intro)) 214 | short_desc = "" 215 | 216 | 217 | params = {"title": title, 218 | "short_desc": short_desc, 219 | "intro_para": intro_para.strip(), 220 | "alternative_names": clean_names(alternative_names), 221 | "redirects": clean_names(redirects), 222 | "affiliated_people": [], 223 | "box_type": None} 224 | 225 | for template in wikicode.get_sections()[0].filter_templates(): 226 | if re.search("[Ii]nfobox", template.name.strip()): 227 | # do it this way to prevent overwriting 228 | info_box = {p.name.strip(): p.value.strip_code().strip() for p in template.params} 229 | params['infobox'] = info_box 230 | params['box_type'] = re.sub("Infobox", "", str(template.name)).strip() 231 | break 232 | 233 | if 'infobox' in params.keys(): 234 | for k in ['name', 'native_name', 'other_name', 'alias', 'birth_name', 'nickname', 'other_names']: 235 | if k in params['infobox'].keys(): 236 | newline_alt = [i.strip() for i in 
params['infobox'][k].split("\n") if i.strip()] 237 | new_alt = [j.strip() for i in newline_alt for j in i.split(",")] 238 | params['alternative_names'].extend(new_alt) 239 | 240 | affiliated_people = [] 241 | for k in ['leaders', 'founded_by', 'founder']: 242 | if k in params['infobox'].keys(): 243 | aff_people = [i.strip() for i in params['infobox'][k].split("\n") if i.strip()] 244 | aff_people = [j.strip() for i in aff_people for j in i.split(",")] 245 | affiliated_people.extend(aff_people) 246 | 247 | params['affiliated_people'] = clean_names(affiliated_people) 248 | params['alternative_names'] = clean_names(params['alternative_names']) 249 | 250 | 251 | raw_categories = wikicode.get_sections()[-1].strip_code() 252 | categories = re.findall("Category:(.+?)\n", raw_categories) 253 | params['categories'] = categories 254 | 255 | if 'infobox' in params.keys(): 256 | for k in ['map']: 257 | if k in params['infobox'].keys(): 258 | del params['infobox'][k] 259 | 260 | params['update'] = datetime.date.today().isoformat() 261 | logger.debug(f"Good article: {title}") 262 | 263 | if title in ['Kyle Rittenhouse', 'Dmitry Peskov', 'Warsaw', 'Brasília', 'Beirut', 'Muhammadu Buhari', 264 | 'Anil Deshmukh', 'Viktor Orbán']: 265 | with open(f"error_articles/list/{title}.json", "w") as f: 266 | json.dump(params, f) 267 | return params 268 | 269 | def wrapper_loader(title, text, page=None): 270 | res = parse_wiki_article(page, title, text) 271 | if not res: 272 | return None 273 | action = {"_index" : "wiki", 274 | #"_id" : res['title'], # it turns out the titles aren't globally unique, so can't use as an ID 275 | "_source" : res} 276 | return action 277 | 278 | 279 | def load_batch_es(page_batch, p, es): 280 | actions = [p.apply_async(wrapper_loader, (title, text)) for title, text in page_batch if title] 281 | actions = [i.get() for i in tqdm(actions, leave=False) if i] 282 | actions = [i for i in actions if i] 283 | try: 284 | helpers.bulk(es, actions, chunk_size=-1, raise_on_error=False) 285 | logger.info("Bulk loading success") 286 | except Exception as e: 287 | logger.info(f"Error in loading Wiki batch!!: {e}. 
Loading stories individually...")
288 |         for i in actions:
289 |             try:
290 |                 # helpers.bulk expects an iterable of actions, so wrap the single action in a list
291 |                 response = helpers.bulk(es, [i], chunk_size=-1, raise_on_error=False)
292 |                 if response[1]:
293 |                     logger.info(f"Error on loading story {i}: {response[1]}")
294 |             except Exception as e:
295 |                 logger.info(f"Skipping single Wiki story {e}")
296 | 
297 | 
298 | 
299 | # (unused interactive helper; relies on `redirect_dict` existing as a global)
300 | def redirect_wrapper(title, text):
301 |     redir = get_page_redirect(None, title, text)
302 |     if redir:
303 |         if redir[1] not in redirect_dict.keys():
304 |             redirect_dict[redir[1]] = [redir[0]]
305 |         else:
306 |             redirect_dict[redir[1]] = list(set(redirect_dict[redir[1]] + [redir[0]]))
307 | 
308 | 
309 | def read_clean_redirects():
310 |     files = os.listdir()
311 |     versions = [int(re.findall("dict_(\d+)\.", i)[0]) for i in files if re.match("redirect_dict", i)]
312 |     max_file = f"redirect_dict_{max(versions)}.0.pkl"
313 |     logger.info(f"Loading {max_file} into redis")
314 |     with open(max_file, "rb") as f:
315 |         redirect_dict = pickle.load(f)
316 | 
317 |     # Merge lowercase versions of keys with their non-lowercase versions.
318 |     # The `k != k.lower()` guard matters: without it, keys that are already
319 |     # lowercase get merged with themselves and then deleted outright.
320 |     #len = 1132887
321 |     del_list = []
322 |     for k in redirect_dict.keys():
323 |         if k != k.lower() and k.lower() in redirect_dict.keys():
324 |             redirect_dict[k] = redirect_dict[k] + redirect_dict[k.lower()]
325 |             del_list.append(k.lower())
326 | 
327 |     for d in del_list:
328 |         if d in redirect_dict.keys():
329 |             del redirect_dict[d]
330 |     # len = 1106119
331 |     return redirect_dict
332 | 
333 | 
334 | @plac.pos('process', "Which process to run?", choices=['build_links', 'load_redis', 'load_es'])
335 | @plac.pos('file', "Wikipedia dump location")
336 | @plac.pos('es_batch', "Elasticsearch batch size")
337 | @plac.pos('threads', "number of threads to use")
338 | def process(process, file="enwiki-latest-pages-articles.xml.bz2", es_batch=5000, threads=10):
339 |     # plac passes command-line arguments in as strings
340 |     es_batch, threads = int(es_batch), int(threads)
341 |     p = multiprocessing.Pool(threads)
342 |     logger.info(f"Reading from {file}")
343 |     if re.search("bz2", file):
344 |         dump = mwxml.Dump.from_file(bzopen(file, "r"))
345 |     else:
346 |         dump = mwxml.Dump.from_file(file)
347 | 
348 |     #dump = mwxml.Dump.from_file(open("Wikipedia-protest-export.xml"))
349 |     # 1 core = 11.077 total
350 |     # 5 cores = 3.254 total
351 |     # 10 cores = 3.075 total
352 | 
353 |     if process == "build_links":
354 |         redirect_dict = {}
355 |         logger.info("Building redirect link dictionary...")
356 |         page_batch = []
357 |         for n, page in tqdm(enumerate(dump), total=22373694):
358 |             if n % 1000000 == 0 and n > 0:
359 |                 k = n / 1000000
360 |                 with open(f"redirect_dict_{k}.pkl", "wb") as f:
361 |                     pickle.dump(redirect_dict, f)
362 |                 logger.info(f"Dumped at {k} x 1,000,000")
363 |                 #break
364 |                 # continue
365 |             if page:
366 |                 page_batch.append((page.title, next(page).text))
367 |                 if len(page_batch) % 5000 == 0:
368 |                     # page_batch always holds (title, text) tuples, so the old non-bz2 branch
369 |                     # (which passed the tuple in as `page`) was broken; use a single path
370 |                     actions = [p.apply_async(get_page_redirect, (None, title, text)) for title, text in page_batch if title]
371 |                     actions = [i.get() for i in tqdm(actions, leave=False)]
372 |                     for redir in tqdm(actions, leave=False):
373 |                         if not redir:
374 |                             continue
375 |                         if redir[1] not in redirect_dict.keys():
376 |                             redirect_dict[redir[1]] = [redir[0]]
377 |                         else:
378 |                             redirect_dict[redir[1]] = list(set(redirect_dict[redir[1]] + [redir[0]]))
379 |                     page_batch = []
380 |         # get the final batch
381 |         # This one isn't wrapped in a function to make sure redirect_dict stays in the right scope
382 |         # (the old code re-looped over the previous batch's stale `actions` here and
383 |         # silently dropped any pages left in the final, partial batch)
384 |         actions = [p.apply_async(get_page_redirect, (None, title, text)) for title, text in page_batch if title]
385 |         actions = [i.get() for i in tqdm(actions, leave=False)]
386 |         for redir in tqdm(actions, leave=False):
387 |             if not redir:
388 |                 continue
389 |             if redir[1] not in redirect_dict.keys():
390 |                 redirect_dict[redir[1]] = [redir[0]]
391 |             else:
392 |                 redirect_dict[redir[1]] = list(set(redirect_dict[redir[1]] + [redir[0]]))
393 |         k = (n // 1000000) + 1  # recompute: `k` is undefined here for dumps under 1,000,000 pages
394 |         with open(f"redirect_dict_{k}.0.pkl", "wb") as f:
395 |             pickle.dump(redirect_dict, f)
396 | 
397 | 
398 |     elif process == "load_redis":
399 |         logger.info("Reading redirect dict...")
400 |         redirect_dict = read_clean_redirects()
401 |         redis_db = redis.StrictRedis(host="localhost", port=6379, db=0)
402 |         pipe = redis_db.pipeline()
403 |         for n, item in tqdm(enumerate(redirect_dict.items()), total=len(redirect_dict)):
404 |             k, v = item
405 |             v_str = ";".join(v)
406 |             pipe.set(k, v_str)
407 |             if n % 1000 == 0:
408 |                 pipe.execute()
409 |         # get the final batch
410 |         pipe.execute()
411 | 
412 |     elif process == "load_es":
413 |         logger.info("Loading Wikipedia into Elasticsearch")
414 |         # (the old call passed `urls=`, which is not a valid Elasticsearch argument)
415 |         es = Elasticsearch('http://localhost:9200/', timeout=60, max_retries=2)
416 | 
417 |         page_batch = []
418 |         for n, page in tqdm(enumerate(dump), total=21726007):
419 |             if page:
420 |                 page_batch.append((page.title, next(page).text))
421 |                 if len(page_batch) % es_batch == 0:
422 |                     #logger.debug(f"Loaded {page.title}")
423 |                     load_batch_es(page_batch, p, es)
424 |                     page_batch = []
425 |         # load final batch
426 |         load_batch_es(page_batch, p, es)
427 | 
428 | 
429 | if __name__ == '__main__':
430 |     plac.call(process)
431 | 
432 | 
-------------------------------------------------------------------------------- /setup/wiki/load_wiki_scratch.py: --------------------------------------------------------------------------------
1 | # Scratch/debugging snippets. The imports below are assumed; they were missing from the original file.
2 | import re
3 | import mwxml
4 | from bz2 import BZ2File as bzopen
5 | from tqdm import tqdm
6 | from load_wiki_es import parse_wiki_article
7 | 
8 | file = "enwiki-latest-pages-articles.xml.bz2"
9 | dump = mwxml.Dump.from_file(bzopen(file, "r"))
10 | 
11 | results = []
12 | title_list = ['Anil Deshmukh', 'Mamata Banerjee', 'Sameer Wankhede', 'Brasilia', 'Kyle Rittenhouse', 'Ahmad Massoud', 'Ariel Henry', 'Augusto Aras',
13 |               'Geneva', 'Beirut']
14 | 
15 | for n, page in tqdm(enumerate(dump), total=100):
16 |     title = page.title
17 |     if title not in title_list:
18 |         continue
19 |     text = next(page).text
20 |     r = parse_wiki_article(page, title, text)
21 |     results.append(r)
22 |     print(title)
23 |     #if n > 100:
24 |     #    break
25 | 
26 | # (stray fragments, commented out:)
27 | # , next(page).text)
28 | # parse_wiki_article()
29 | 
30 | 
31 | raw = """{{Short description|Ancient Greek city in Anatolia}}\n{{Use dmy dates|date=April 2020}}\n{{Infobox ancient site\n|name = Anazarbus\n|native_name = Anavarza {{in lang|tr}}\n|alternate_name = Caesarea, Justinopolis\n|image = Anavarza_Triumphal_arch_in_Anazarbus_2754.jpg\n|alt = \n|caption = The triumphal arch of Anazarbus was later converted to the city\'s South Gate.\n|map_type = Turkey\n|map_alt = \n|map_size = 270\n|coordinates = {{coord|37|15|50|N|35|54|20|E|display=inline,title}}\n|location = [[Adana Province]], Turkey\n|region = [[Cilicia]]\n|type = Settlement\n|part_of = \n|length = \n|width = \n|area = \n|height = \n|builder = \n|material = \n|built = \n|abandoned = \n|epochs = \n|cultures = \n|dependency_of = \n|occupants = \n|event = \n|excavations = \n|archaeologists = \n|condition = \n|ownership = \n|management = \n|public_access = \n|website = \n|notes = \n}}\n\n[[File:Anazarbe_vue_générale_1.jpg|thumb|right|300px|General view of the site]]\n[[Image:Anazarbus clikya west gate and anvarza castle.JPG|thumb|right|200px|Anazarbus West Gate]]\n\'\'\'Anazarbus \'\'\' ({{lang-grc|Ἀναζαρβός}}, medieval \'\'\'Ain Zarba\'\'\'; modern
\'\'\'Anavarza\'\'\'; {{lang-ar|عَيْنُ زَرْبَة}}) was an ancient [[Cilicia]]n city. Under the late Roman Empire, it was the capital of [[Cilicia Secunda]]. [[Roman emperor]] [[Justinian I]] rebuilt the city in 527 after a strong earthquake hit it. It was destroyed in 1374 by the forces of [[Mamluk Empire]], after their conquest of Armenia.\n\n"""
32 | raw = """'{{Short description|Ethnic group in Japan and Russia}}\n{{For|the ethnic group of Western China|Äynu people}}\n{{Use mdy dates|date=April 2020}}\n{{Infobox ethnic group\n| group = Ainu\n| image = File:Ainu Marriage.jpg \n| image_alt = \n| caption = Ainu at a traditional marriage ceremony in [[Hokkaido]].\n| population = {{plainlist|\n* 25,000\n* (Japanese government estimate, 2002)\n* ≥200,000\n* (Unofficial estimate){{cite book|last=Poisson|first=Barbara Aoki|year=2002|title=The Ainu of Japan|publisher=Lerner Publications|location=Minneapolis|page=[https://archive.org/details/ainuofjapan00pois/page/5 5]|isbn=978-0-82254-176-9|url-access=registration|url=https://archive.org/details/ainuofjapan00pois/page/5}}"""
33 | raw = """"{{short description|Political philosophy and movement}}\n{{other uses}}\n{{redirect2|Anarchist|Anarchists|other uses|Anarchist (disambiguation)}}\n{{distinguish|Anarchy}}\n{{pp-semi-indef}}\n{{good article}}\n{{use British English|date=August 2021}}\n{{use dmy dates|date=August 2021}}\n{{anarchism sidebar}}\n{{basic forms of government}}\n'"""
34 | # ([Ss] added to match the pattern in load_wiki_es.py; the `raw` above uses lowercase "short description")
35 | re.findall("\{\{[Ss]hort description\|(.+?)\}\}", raw)[0]
36 | 
37 | 
38 | raw = """thumb|Main amethyst-producing countries\n\nAmethyst is a violet variety of quartz. The name comes from the Koine Greek αμέθυστος amethystos from α- a-, "not" and μεθύσκω (Ancient Greek)"""
39 | raw = """\n\nAmethyst is a violet variety of quartz. The name comes from the Koine Greek αμέθυστος amethystos from α- a-, "not" and μεθύσκω (Ancient Greek)"""
40 | re.sub("^thumb\|.+?\n", "", raw)
-------------------------------------------------------------------------------- /setup/wiki/requirements.txt: --------------------------------------------------------------------------------
1 | elasticsearch>=7.16.2,<8.0
2 | elasticsearch-dsl>=7.4.0,<8.0
3 | mwparserfromhell>=0.6.3,<1.0
4 | mwtypes>=0.3.2,<1.0
5 | mwxml>=0.3.3,<1.0
6 | tqdm
7 | textacy>=0.12.0,<1.0
8 | redis>=4.1.0,<5.0
9 | plac  # used by load_wiki_es.py; was missing
-------------------------------------------------------------------------------- /setup/wiki/wiki_mapping.json: --------------------------------------------------------------------------------
1 | {
2 |     "settings" : {
3 |         "number_of_shards" : 1,
4 |         "number_of_replicas" : 0
5 |     },
6 |     "mappings" : {
7 |         "properties" : {
8 |             "categories" : {"type" : "keyword", "index": "true"},
9 |             "intro_para" : {"type" : "text"},
10 |             "title" : {"type" : "text"},
11 |             "alternative_names" : {"type" : "text", "similarity" : "BM25",
12 |                 "norms": true},
13 |             "redirects" : {"type" : "text", "similarity" : "boolean",
14 |                 "norms": true},
15 |             "box_type" : {"type" : "keyword", "index": "true"},
16 |             "infobox": {"type": "flattened"}
17 |         }
18 |     }
19 | }
20 | 
--------------------------------------------------------------------------------