├── .gitignore ├── NGEC ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-310.pyc │ ├── __init__.cpython-37.pyc │ ├── __init__.cpython-38.pyc │ ├── __init__.cpython-39.pyc │ ├── actor_resolution.cpython-310.pyc │ ├── actor_resolution.cpython-37.pyc │ ├── actor_resolution.cpython-38.pyc │ ├── actor_resolution.cpython-39.pyc │ ├── attribute_model.cpython-310.pyc │ ├── attribute_model.cpython-37.pyc │ ├── attribute_model.cpython-39.pyc │ ├── formatter.cpython-310.pyc │ ├── formatter.cpython-37.pyc │ ├── formatter.cpython-39.pyc │ ├── geolocation.cpython-310.pyc │ ├── geolocation.cpython-37.pyc │ ├── geolocation.cpython-39.pyc │ ├── utilities.cpython-310.pyc │ └── utilities.cpython-39.pyc ├── actor_resolution.py ├── assets │ ├── PLOVER_agents.hash │ ├── PLOVER_agents.txt │ ├── README.md │ ├── actor_sim_model2 │ │ ├── 1_Pooling │ │ │ └── config.json │ │ ├── README.md │ │ ├── config.json │ │ ├── config_sentence_transformers.json │ │ ├── eval │ │ │ └── similarity_evaluation_sts-dev_results.csv │ │ ├── modules.json │ │ ├── pytorch_model.bin │ │ ├── sentence_bert_config.json │ │ ├── special_tokens_map.json │ │ ├── tokenizer.json │ │ ├── tokenizer_config.json │ │ └── vocab.txt │ ├── bert_matrix.pkl │ ├── countries.csv │ ├── countries.numbers │ ├── cow2iso.txt │ ├── event_mode_questions.csv │ ├── event_models │ │ ├── ACCUSE.skops │ │ ├── AGREE.skops │ │ ├── AID.skops │ │ ├── ASSAULT.skops │ │ ├── COERCE.skops │ │ ├── CONCEDE.skops │ │ ├── CONSULT.skops │ │ ├── COOPERATE.skops │ │ ├── MOBILIZE.skops │ │ ├── PROTEST.skops │ │ ├── REJECT.skops │ │ ├── REQUEST.skops │ │ ├── RETREAT.skops │ │ ├── SANCTION.skops │ │ ├── SUPPORT.skops │ │ └── THREATEN.skops │ ├── option_model.pt │ ├── pattern_matrix.npy │ └── pattern_matrix.pkl ├── attribute_model.py ├── context_class.py ├── event_class.py ├── formatter.py ├── geolocation.py ├── mode_class.py ├── tests │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ ├── conftest.cpython-39-pytest-7.0.1.pyc │ │ └── test_actor_resolution.cpython-39-pytest-7.0.1.pyc │ ├── conftest.py │ ├── test_actor_resolution.py │ ├── test_attribute_model.py │ ├── test_formatter.py │ └── test_multiple_actors.py └── utilities.py ├── README.md ├── examples ├── Guardian_SDF_sample.csv.zip ├── NGEC_pres.pdf ├── README.md ├── demo_mordecai.py └── demo_wiki_resolution.py ├── ngec_process.py ├── ngec_streamlit.py ├── requirements.txt ├── setup.py └── setup ├── README.md ├── train_classifiers ├── README.md ├── fit_event_classifier.py ├── generate_synthetic_news.py ├── gpt_synthetic_events_2023-10-19_19.csv.zip └── synthetic_headlines.csv └── wiki ├── README.md ├── actor_contrastive_data.py ├── create_index.sh ├── load_wiki_es.py ├── load_wiki_scratch.py ├── requirements.txt └── wiki_mapping.json /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | __pycache__ 3 | NGEC/__pycache__* 4 | *.pyc 5 | *.hash 6 | NGEC/assets/bert_matrix.pkl 7 | -------------------------------------------------------------------------------- /NGEC/__init__.py: -------------------------------------------------------------------------------- 1 | from .event_class import EventClass 2 | from .actor_resolution import ActorResolver 3 | from .geolocation import GeolocationModel 4 | from .attribute_model import AttributeModel 5 | from .formatter import Formatter 6 | -------------------------------------------------------------------------------- /NGEC/__pycache__/__init__.cpython-310.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/__pycache__/__init__.cpython-310.pyc -------------------------------------------------------------------------------- /NGEC/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /NGEC/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /NGEC/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /NGEC/__pycache__/actor_resolution.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/__pycache__/actor_resolution.cpython-310.pyc -------------------------------------------------------------------------------- /NGEC/__pycache__/actor_resolution.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/__pycache__/actor_resolution.cpython-37.pyc -------------------------------------------------------------------------------- /NGEC/__pycache__/actor_resolution.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/__pycache__/actor_resolution.cpython-38.pyc -------------------------------------------------------------------------------- /NGEC/__pycache__/actor_resolution.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/__pycache__/actor_resolution.cpython-39.pyc -------------------------------------------------------------------------------- /NGEC/__pycache__/attribute_model.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/__pycache__/attribute_model.cpython-310.pyc -------------------------------------------------------------------------------- /NGEC/__pycache__/attribute_model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/__pycache__/attribute_model.cpython-37.pyc -------------------------------------------------------------------------------- /NGEC/__pycache__/attribute_model.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/__pycache__/attribute_model.cpython-39.pyc -------------------------------------------------------------------------------- /NGEC/__pycache__/formatter.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/__pycache__/formatter.cpython-310.pyc -------------------------------------------------------------------------------- /NGEC/__pycache__/formatter.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/__pycache__/formatter.cpython-37.pyc -------------------------------------------------------------------------------- /NGEC/__pycache__/formatter.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/__pycache__/formatter.cpython-39.pyc -------------------------------------------------------------------------------- /NGEC/__pycache__/geolocation.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/__pycache__/geolocation.cpython-310.pyc -------------------------------------------------------------------------------- /NGEC/__pycache__/geolocation.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/__pycache__/geolocation.cpython-37.pyc -------------------------------------------------------------------------------- /NGEC/__pycache__/geolocation.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/__pycache__/geolocation.cpython-39.pyc -------------------------------------------------------------------------------- /NGEC/__pycache__/utilities.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/__pycache__/utilities.cpython-310.pyc -------------------------------------------------------------------------------- /NGEC/__pycache__/utilities.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/__pycache__/utilities.cpython-39.pyc -------------------------------------------------------------------------------- /NGEC/assets/PLOVER_agents.hash: -------------------------------------------------------------------------------- 1 | 1359402526471412912 -------------------------------------------------------------------------------- /NGEC/assets/README.md: -------------------------------------------------------------------------------- 1 | - ISO --> COW conversion document. 
Credit: https://github.com/leops95/cow2iso -------------------------------------------------------------------------------- /NGEC/assets/actor_sim_model2/1_Pooling/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "word_embedding_dimension": 384, 3 | "pooling_mode_cls_token": false, 4 | "pooling_mode_mean_tokens": true, 5 | "pooling_mode_max_tokens": false, 6 | "pooling_mode_mean_sqrt_len_tokens": false 7 | } -------------------------------------------------------------------------------- /NGEC/assets/actor_sim_model2/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | pipeline_tag: sentence-similarity 3 | tags: 4 | - sentence-transformers 5 | - feature-extraction 6 | - sentence-similarity 7 | - transformers 8 | 9 | --- 10 | 11 | # {MODEL_NAME} 12 | 13 | This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search. 14 | 15 | 16 | 17 | ## Usage (Sentence-Transformers) 18 | 19 | Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed: 20 | 21 | ``` 22 | pip install -U sentence-transformers 23 | ``` 24 | 25 | Then you can use the model like this: 26 | 27 | ```python 28 | from sentence_transformers import SentenceTransformer 29 | sentences = ["This is an example sentence", "Each sentence is converted"] 30 | 31 | model = SentenceTransformer('{MODEL_NAME}') 32 | embeddings = model.encode(sentences) 33 | print(embeddings) 34 | ``` 35 | 36 | 37 | 38 | ## Usage (HuggingFace Transformers) 39 | Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: First, you pass your input through the transformer model, then you have to apply the right pooling-operation on-top of the contextualized word embeddings. 40 | 41 | ```python 42 | from transformers import AutoTokenizer, AutoModel 43 | import torch 44 | 45 | 46 | #Mean Pooling - Take attention mask into account for correct averaging 47 | def mean_pooling(model_output, attention_mask): 48 | token_embeddings = model_output[0] #First element of model_output contains all token embeddings 49 | input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() 50 | return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9) 51 | 52 | 53 | # Sentences we want sentence embeddings for 54 | sentences = ['This is an example sentence', 'Each sentence is converted'] 55 | 56 | # Load model from HuggingFace Hub 57 | tokenizer = AutoTokenizer.from_pretrained('{MODEL_NAME}') 58 | model = AutoModel.from_pretrained('{MODEL_NAME}') 59 | 60 | # Tokenize sentences 61 | encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt') 62 | 63 | # Compute token embeddings 64 | with torch.no_grad(): 65 | model_output = model(**encoded_input) 66 | 67 | # Perform pooling. In this case, mean pooling. 
68 | sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask']) 69 | 70 | print("Sentence embeddings:") 71 | print(sentence_embeddings) 72 | ``` 73 | 74 | 75 | 76 | ## Evaluation Results 77 | 78 | 79 | 80 | For an automated evaluation of this model, see the *Sentence Embeddings Benchmark*: [https://seb.sbert.net](https://seb.sbert.net?model_name={MODEL_NAME}) 81 | 82 | 83 | ## Training 84 | The model was trained with the parameters: 85 | 86 | **DataLoader**: 87 | 88 | `torch.utils.data.dataloader.DataLoader` of length 30311 with parameters: 89 | ``` 90 | {'batch_size': 128, 'sampler': 'torch.utils.data.sampler.RandomSampler', 'batch_sampler': 'torch.utils.data.sampler.BatchSampler'} 91 | ``` 92 | 93 | **Loss**: 94 | 95 | `sentence_transformers.losses.ContrastiveLoss.ContrastiveLoss` with parameters: 96 | ``` 97 | {'distance_metric': 'SiameseDistanceMetric.COSINE_DISTANCE', 'margin': 0.5, 'size_average': True} 98 | ``` 99 | 100 | Parameters of the fit()-Method: 101 | ``` 102 | { 103 | "epochs": 3, 104 | "evaluation_steps": 40000, 105 | "evaluator": "sentence_transformers.evaluation.EmbeddingSimilarityEvaluator.EmbeddingSimilarityEvaluator", 106 | "max_grad_norm": 1, 107 | "optimizer_class": "", 108 | "optimizer_params": { 109 | "lr": 0.0001 110 | }, 111 | "scheduler": "WarmupLinear", 112 | "steps_per_epoch": null, 113 | "warmup_steps": 9094, 114 | "weight_decay": 0.01 115 | } 116 | ``` 117 | 118 | 119 | ## Full Model Architecture 120 | ``` 121 | SentenceTransformer( 122 | (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 123 | (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False}) 124 | ) 125 | ``` 126 | 127 | ## Citing & Authors 128 | 129 | -------------------------------------------------------------------------------- /NGEC/assets/actor_sim_model2/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name_or_path": "/home/gridsan/ahalt/huggingface_models/paraphrase-MiniLM-L3-v2/", 3 | "architectures": [ 4 | "BertModel" 5 | ], 6 | "attention_probs_dropout_prob": 0.1, 7 | "classifier_dropout": null, 8 | "gradient_checkpointing": false, 9 | "hidden_act": "gelu", 10 | "hidden_dropout_prob": 0.1, 11 | "hidden_size": 384, 12 | "initializer_range": 0.02, 13 | "intermediate_size": 1536, 14 | "layer_norm_eps": 1e-12, 15 | "max_position_embeddings": 512, 16 | "model_type": "bert", 17 | "num_attention_heads": 12, 18 | "num_hidden_layers": 3, 19 | "pad_token_id": 0, 20 | "position_embedding_type": "absolute", 21 | "torch_dtype": "float32", 22 | "transformers_version": "4.24.0", 23 | "type_vocab_size": 2, 24 | "use_cache": true, 25 | "vocab_size": 30522 26 | } 27 | -------------------------------------------------------------------------------- /NGEC/assets/actor_sim_model2/config_sentence_transformers.json: -------------------------------------------------------------------------------- 1 | { 2 | "__version__": { 3 | "sentence_transformers": "2.0.0", 4 | "transformers": "4.7.0", 5 | "pytorch": "1.9.0+cu102" 6 | } 7 | } -------------------------------------------------------------------------------- /NGEC/assets/actor_sim_model2/eval/similarity_evaluation_sts-dev_results.csv: -------------------------------------------------------------------------------- 1 | 
epoch,steps,cosine_pearson,cosine_spearman,euclidean_pearson,euclidean_spearman,manhattan_pearson,manhattan_spearman,dot_pearson,dot_spearman 2 | 0,-1,0.6109547883709737,0.5786435852182326,0.5838057989492854,0.5366218195030185,0.5832466180054964,0.536238540813903,0.5818760709295144,0.5343506422729409 3 | 1,-1,0.6801764356662623,0.6354600600378237,0.665073766144093,0.6068232368839985,0.664451226644231,0.6063922669643912,0.6804853813087184,0.6128786592142725 4 | 2,-1,0.7036958969271553,0.6537842300050594,0.7021022027187646,0.6338679348386188,0.7013755290609477,0.6335184031333557,0.7054851039552567,0.6324461529849091 5 | -------------------------------------------------------------------------------- /NGEC/assets/actor_sim_model2/modules.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "idx": 0, 4 | "name": "0", 5 | "path": "", 6 | "type": "sentence_transformers.models.Transformer" 7 | }, 8 | { 9 | "idx": 1, 10 | "name": "1", 11 | "path": "1_Pooling", 12 | "type": "sentence_transformers.models.Pooling" 13 | } 14 | ] -------------------------------------------------------------------------------- /NGEC/assets/actor_sim_model2/pytorch_model.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/actor_sim_model2/pytorch_model.bin -------------------------------------------------------------------------------- /NGEC/assets/actor_sim_model2/sentence_bert_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "max_seq_length": 128, 3 | "do_lower_case": false 4 | } -------------------------------------------------------------------------------- /NGEC/assets/actor_sim_model2/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | { 2 | "cls_token": "[CLS]", 3 | "mask_token": "[MASK]", 4 | "pad_token": "[PAD]", 5 | "sep_token": "[SEP]", 6 | "unk_token": "[UNK]" 7 | } 8 | -------------------------------------------------------------------------------- /NGEC/assets/actor_sim_model2/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "cls_token": "[CLS]", 3 | "do_basic_tokenize": true, 4 | "do_lower_case": true, 5 | "mask_token": "[MASK]", 6 | "model_max_length": 512, 7 | "name_or_path": "/home/gridsan/ahalt/huggingface_models/paraphrase-MiniLM-L3-v2/", 8 | "never_split": null, 9 | "pad_token": "[PAD]", 10 | "sep_token": "[SEP]", 11 | "special_tokens_map_file": "/home/gridsan/ahalt/huggingface_models/paraphrase-MiniLM-L3-v2/special_tokens_map.json", 12 | "strip_accents": null, 13 | "tokenize_chinese_chars": true, 14 | "tokenizer_class": "BertTokenizer", 15 | "unk_token": "[UNK]" 16 | } 17 | -------------------------------------------------------------------------------- /NGEC/assets/bert_matrix.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/bert_matrix.pkl -------------------------------------------------------------------------------- /NGEC/assets/countries.csv: -------------------------------------------------------------------------------- 1 | CCA2,Name,CCA3,Nationality 2 | AF,Afghanistan,AFG,"Afghan, Afghani, Afghans, Islamic Republic of Afghanistan" 3 | AL,Albania,ALB,"Albanian, Albanians" 4 | 
DZ,Algeria,DZA,"Algerian, Algerians" 5 | AD,Andorra,AND,"Andorran, Andorrans" 6 | AO,Angola,AGO,"Angolan, Angolans" 7 | AG,Antigua and Barbuda,ATG,"Antiguan, Barbudan, Antiguans, Barbudans" 8 | AR,Argentina,ARG,"Argentinean, Argentine, Argentineans, Argentines" 9 | AM,Armenia,ARM,"Armenian, Armenians" 10 | AU,Australia,AUS,"Australian, Australians,Christmas Island,Christmas Islander, Christmas Islanders,Keeling Islands, Cocos [Keeling] Islands,Cocos Islander,Heard Island and McDonald Island,Heard and McDonald Islander,Norfolk Island,Norfolk Islander" 11 | AT,Austria,AUT,"Austrian, Austrians" 12 | AZ,Azerbaijan,AZE,"Azerbaijani, Azeri, Azerbaijanis, Azeris" 13 | BS,Bahamas,BHS,"Bahamian, Bahamian" 14 | BH,Bahrain,BHR,"Bahraini, Bahrainis" 15 | BD,Bangladesh,BGD,"Bangladeshi, Bangladeshis" 16 | BB,Barbados,BRB,"Barbadian, Barbadians" 17 | BY,Belarus,BLR,"Belarusian, Belarusians" 18 | BE,Belgium,BEL,"Belgian, Belgians" 19 | BZ,Belize,BLZ,"Belizean, Belizeans" 20 | BJ,Benin,BEN,Beninese 21 | BT,Bhutan,BTN,Bhutanese 22 | BO,Bolivia,BOL,"Bolivian, Bolivians" 23 | BA,Bosnia and Herzegovina,BIH,"Bosnian, Bosniak,Herzegovinian,Bosnians,Bosniaks,Herzegovinians" 24 | BW,Botswana,BWA,"Motswana, Botswanan" 25 | BR,Brazil,BRA,"Brazilian, Brazilians" 26 | BN,Brunei,BRN,"Bruneian, Bruneians" 27 | BG,Bulgaria,BGR,"Bulgarian, Bulgarians" 28 | BF,Burkina Faso,BFA,Burkinabe 29 | BI,Burundi,BDI,"Burundian, Burundians" 30 | KH,Cambodia,KHM,"Cambodian, Cambodians" 31 | CM,Cameroon,CMR,"Cameroonian, Cameroonians" 32 | CA,Canada,CAN,"Canadian, Canadians" 33 | CV,Cabo Verde,CPV,"Cape Verdian,Cabo Verde,Cape Verde" 34 | CF,Central African Republic,CAF,"Central African, Central Africans" 35 | TD,Chad,TCD,"Chadian,Chadians" 36 | CL,Chile,CHL,"Chilean, Chileans" 37 | CN,China,CHN,"Chinese, People's Republic of China" 38 | CO,Colombia,COL,"Colombian, Colombians" 39 | KM,Comoros,COM,"Comoran, Comorans" 40 | CG,Congo,COG,"Republic of the Congo,Congolese,Congo-Brazzaville" 41 | CR,Costa Rica,CRI,"Costa Rican, Costa Ricans" 42 | CI,Cote d'Ivoire,CIV,"Ivorian, Ivorians, Cote d’Ivoire, Ivory Coast, Côte d’Ivoire" 43 | HR,Croatia,HRV,"Croatian, Croat, Croatians, Croats" 44 | CU,Cuba,CUB,"Cuban, Cubans" 45 | CY,Cyprus,CYP,"Cypriot, Cypriots" 46 | CZ,Czechia,CZE,"Czech, Czechs,Czech Republic" 47 | CD,Democratic Republic of the Congo,COD,"DRC,DR Congo,Zaire" 48 | DK,Denmark,DNK,"Danish, Dane, Danes,Faroe Islands,Faroese,Faroe Islander,Greenland,Greenlandic, Greenlander, Greenlanders" 49 | DJ,Djibouti,DJI,"Djiboutian, Djiboutians" 50 | DO,Dominican Republic,DOM,"Dominican, Dominicans, Dominicanos" 51 | EC,Ecuador,ECU,"Ecuadorean, Ecuadoreans" 52 | EG,Egypt,EGY,"Egyptian, Egyptians" 53 | SV,El Salvador,SLV,"Salvadoran,Salvadorans" 54 | GQ,Equatorial Guinea,GNQ,"Equatorial Guinean, Equatorial Guineans" 55 | ER,Eritrea,ERI,"Eritrean, Eritreans" 56 | EE,Estonia,EST,"Estonian, Estonians" 57 | ET,Ethiopia,ETH,"Ethiopian, Ethiopians" 58 | NA,European Union,EUR,"European Union, EU" 59 | FJ,Fiji,FJI,"Fijian, Fijians" 60 | FI,Finland,FIN,"Finnish, Finn, Finns,Åland Islands,Åland Islander" 61 | FR,France,FRA,"French,French Guiana,French Polynesia,French Polynesian,French Polynesians,French Southern Territories,Guadeloupe,Guadeloupian, Guadeloupians,Martinique,Mayotte,New Caledonia,New Caledonian,New Caledonians,Réunion,Saint Barthélemy,Saint Martin,Saint Martin Islander,Saint Martin Islanders,Saint Pierre and Miquelon,Wallis and Futuna,Wallis and Futuna Islander" 62 | GA,Gabon,GAB,Gabonese 63 | GM,Gambia,GMB,"Gambian, Gambians" 64 | 
GE,Georgia,GEO,"Georgian, Georgians" 65 | DE,Germany,DEU,"German, Germans" 66 | GH,Ghana,GHA,"Ghanaian, Ghanaians" 67 | GR,Greece,GRC,"Greek, Greeks" 68 | GD,Grenada,GRD,Grenadian 69 | GT,Guatemala,GTM,"Guatemalan, Guatemalans" 70 | GN,Guinea,GIN,"Guinean, Guineans" 71 | GW,Guinea-Bissau,GNB,"Guinea-Bissauan, Guinea-Bissauans" 72 | GY,Guyana,GUY,Guyanese 73 | HT,Haiti,HTI,"Haitian, Haitians" 74 | HN,Honduras,HND,"Honduran, Hondurans" 75 | HK,Hong Kong SAR China,HKG,"Hong Kong, Hong Konger, Hong Kongers,Hong Kongese" 76 | HU,Hungary,HUN,"Hungarian, Hungarians" 77 | IS,Iceland,ISL,"Icelander, Icelanders" 78 | IN,India,IND,"Indian, Indians" 79 | ID,Indonesia,IDN,"Indonesian, Indonesians" 80 | IR,Iran,IRN,"Iranian, Iranians,Islamic Republic of Iran" 81 | IQ,Iraq,IRQ,"Iraqi, Iraqis" 82 | IE,Ireland,IRL,Irish 83 | IL,Israel,ISR,"Israeli, Israelis" 84 | IT,Italy,ITA,"Italian, Italians" 85 | JM,Jamaica,JAM,"Jamaican, Jamaicans" 86 | JP,Japan,JPN,Japanese 87 | JO,Jordan,JOR,"Jordanian, Jordanians" 88 | KZ,Kazakhstan,KAZ,"Kazakhstani, Kazakh, Kazakhs" 89 | KE,Kenya,KEN,"Kenyan, Kenyan" 90 | KI,Kiribati,KIR,I-Kiribati 91 | XK,Kosovo,XKX,"Kosovar, Kosovars, Kosovans, Kosovan" 92 | KW,Kuwait,KWT,"Kuwaiti, Kuwaitis" 93 | KG,Kyrgyzstan,KGZ,"Kirghiz, Kyrgyz" 94 | LA,Laos,LAO,"Laotian,Laotians,Lao People's Democratic Republic" 95 | LV,Latvia,LVA,"Latvian,Latvians" 96 | LB,Lebanon,LBN,Lebanese 97 | LS,Lesotho,LSO,"Mosotho,Mosothos" 98 | LR,Liberia,LBR,"Liberian,Liberians" 99 | LY,Libya,LBY,"Libyan,Libyans" 100 | LI,Liechtenstein,LIE,"Liechtensteiner, Liechtensteiners" 101 | LT,Lithuania,LTU,"Lithuanian,Lithuanians" 102 | LU,Luxembourg,LUX,"Luxembourger,Luxembourgers" 103 | MK,North Macedonia,MKD,"Macedonia,Macedonian,Macedonians,FYROM" 104 | MG,Madagascar,MDG,Malagasy 105 | MW,Malawi,MWI,"Malawian,Malawians,Malawis" 106 | MY,Malaysia,MYS,"Malaysian,Malaysians" 107 | MV,Maldives,MDV,"Maldivan,Maldivans" 108 | ML,Mali,MLI,"Malian,Malians" 109 | MT,Malta,MLT,Maltese 110 | MH,Marshall Islands,MHL,Marshallese 111 | MR,Mauritania,MRT,"Mauritanian,Mauritanians" 112 | MU,Mauritius,MUS,Mauritian 113 | MX,Mexico,MEX,"Mexican,Mexicans" 114 | FM,Micronesia,FSM,"Micronesian, Micronesians" 115 | MD,Moldova,MDA,"Moldovan,Moldovans" 116 | MC,Monaco,MCO,"Monegasque,Monegasques" 117 | MN,Mongolia,MNG,"Mongolian,Mongolians" 118 | ME,Montenegro,MNE,"Montenegrin,Montenegrins" 119 | MA,Morocco,MAR,"Moroccan,Moroccans,Western Sahara,Sahrawi" 120 | MZ,Mozambique,MOZ,"Mozambican,Mozambicans" 121 | MM,Myanmar [Burma],MMR,"Myanmar, Burmese" 122 | NA,Namibia,NAM,"Namibian,Namibians" 123 | NR,Nauru,NRU,Nauruan 124 | NP,Nepal,NPL,Nepalese 125 | NL,Netherlands,NLD,"Dutch,Aruba,Aruban,Arubans,Bonaire, Sint Eustatius and Saba, Curaçao, Curaçaoan,Curacao,Curacaoan,Sint Maarten" 126 | NZ,New Zealand,NZL,"New Zealander, Kiwi,New Zealanders, Kiwis,Niue,Niuean,Niuean,Tokelau,Tokelauan,Cook Islands, Cook Islander" 127 | NI,Nicaragua,NIC,"Nicaraguan,Nicaraguans" 128 | NE,Niger,NER,"Nigeran,Nigerans" 129 | NG,Nigeria,NGA,"Nigerian,Nigerians" 130 | KP,North Korea,PRK,"North Korean, North Koreans,DPRK" 131 | NO,Norway,NOR,"Norwegian,Norwegians,Bouvet Island,Bouvet Islanders,Svalbard and Jan Mayen,Svalbard" 132 | OM,Oman,OMN,"Omani,Omanis" 133 | PK,Pakistan,PAK,"Pakistani,Pakistanis" 134 | PW,Palau,PLW,"Palauan,Palauans" 135 | PS,Palestinian Territories,PSE,"Palestinian,Palestine,Palestinians" 136 | PA,Panama,PAN,"Panamanian,Panamanians" 137 | PG,Papua New Guinea,PNG,"Papua New Guinean,Papua New Guineans" 138 | 
PY,Paraguay,PRY,"Paraguayan,Paraguayans" 139 | PE,Peru,PER,"Peruvian,Peruvians" 140 | PH,Philippines,PHL,"Filipino,Filipinos" 141 | PL,Poland,POL,"Polish,Poles" 142 | PT,Portugal,PRT,Portuguese 143 | QA,Qatar,QAT,"Qatari,Qataris" 144 | RO,Romania,ROU,"Romanian,Romanians" 145 | RU,Russia,RUS,"Russian,Russians" 146 | RW,Rwanda,RWA,"Rwandan,Rwandans" 147 | KN,Saint Kitts and Nevis,KNA,Kittian and Nevisian 148 | LC,Saint Lucia,LCA,"Saint Lucian, Saint Lucians" 149 | VC,Saint Vincent and the Grenadines,VCT,Saint Vincentian 150 | WS,Samoa,WSM,"Samoan,Samoans" 151 | SM,San Marino,SMR,Sammarinese 152 | ST,Sao Tome and Principe,STP,"Sao Tomean,Sao Tomeans,São Tomé and Príncipe" 153 | SA,Saudi Arabia,SAU,"Saudi Arabia,Saudi Arabian,Saudi,Saudi Arabians,Saudis" 154 | SN,Senegal,SEN,Senegalese 155 | RS,Serbia,SRB,"Serbian,Serb,Serbs" 156 | SC,Seychelles,SYC,Seychellois 157 | SL,Sierra Leone,SLE,"Sierra Leonean,Sierra Leoneans" 158 | SG,Singapore,SGP,"Singaporean,Singaporeans" 159 | SK,Slovakia,SVK,"Slovak,Slovaks" 160 | SI,Slovenia,SVN,"Slovene,Slovenes" 161 | SB,Solomon Islands,SLB,"Solomon Islander,Solomon Islanders" 162 | SO,Somalia,SOM,"Somali,Somalis" 163 | ZA,South Africa,ZAF,"South African,South Africans" 164 | KR,South Korea,KOR,"South Korean, South Koreans" 165 | ES,Spain,ESP,"Spanish, Spaniard, Spaniards" 166 | LK,Sri Lanka,LKA,"Sri Lankan, Sri Lankans" 167 | SD,Sudan,SDN,Sudanese 168 | SR,Suriname,SUR,Surinamer 169 | SZ,Eswatini,SWZ,"Swazi,Swazis,Swaziland" 170 | SE,Sweden,SWE,"Swedish,Swede,Swedes" 171 | CH,Switzerland,CHE,Swiss 172 | SY,Syria,SYR,"Syrian,Syrians,Syrian Arab Republic" 173 | TW,Taiwan,TWN,"Taiwanese,Republic of China" 174 | TJ,Tajikistan,TJK,"Tadzhik,Tajik,Tajiks" 175 | TZ,Tanzania,TZA,"Tanzanian, Tanzanians, United Republic of Tanzania" 176 | TH,Thailand,THA,"Thai,Thais" 177 | TL,Timor-Leste,TLS,"East Timorese, East Timor" 178 | TG,Togo,TGO,Togolese 179 | TO,Tonga,TON,"Tongan,Tongan" 180 | TT,Trinidad and Tobago,TTO,"Trinidadian,Trinidadians" 181 | TN,Tunisia,TUN,"Tunisian,Tunisians" 182 | TR,Turkey,TUR,"Turkish,Turk,Turks" 183 | TM,Turkmenistan,TKM,"Turkmen,Turkmeni,Turkmens,Turkmenis" 184 | TV,Tuvalu,TUV,"Tuvaluan,Tuvaluans" 185 | UG,Uganda,UGA,"Ugandan, Ugandans" 186 | UA,Ukraine,UKR,"Ukrainian, Ukrainians" 187 | AE,United Arab Emirates,ARE,"Emirati, Emiratis" 188 | GB,United Kingdom,GBR,"British, English, England, UK, U.K., United Kingdom of Great Britain and Northern Ireland, Britain, Brits, Scottish, Welsh, Scotland, Wales, Great Britain, Anguilla,Anguillian, Anguillians,Bermuda,Bermudian, Bermudians,British Indian Ocean Territory,Cayman Islands,Cayman Islander, Caymanians,Caymanian,Falkland Islands,Falkland Islander,Falkland Islanders,Falklander,Falklanders,Falkland Islands (Malvinas),Islas Malvinas,Gibraltar,Gibraltar, Gibraltarian, Gibraltarians,Guernsey,Isle of Man,Manx,Channel Islander, Channel Islanders,Montserrat,Montserratian,Pitcairn Islands,Pitcairn Islander,Pitcairn Islanders,Saint Helena,Saint Helenian,Saint Helenians,South Georgia and the South Sandwich Islands,Turks and Caicos Islands,Turks and Caicos Islander,British Virgin Islands,British Virgin Island" 189 | NA,United Nations,UNO,"United Nations, UN, U.N." 190 | US,United States,USA,"American, US, U.S., United States,Americans,American Samoa,American Samoan,American Samoans,Guam,Guamanian, Guamanians,Northern Mariana Islands,Puerto Rico,Puerto Rican,Puerto Ricans,United States Minor Outlying Islands,Virgin Islands (U.S.), U.S. 
Virgin Islands,US Virgin Islands" 191 | UY,Uruguay,URY,"Uruguayan, Uruguayans" 192 | UZ,Uzbekistan,UZB,"Uzbekistani, Uzbekistanis" 193 | VU,Vanuatu,VUT,Ni-Vanuatu 194 | VA,Vatican City,VAT,Vatican City 195 | VE,Venezuela,VEN,"Venezuelan,Venezuelans" 196 | VN,Vietnam,VNM,Vietnamese 197 | YE,Yemen,YEM,"Yemeni,Yemenis" 198 | ZM,Zambia,ZMB,"Zambian,Zambians" 199 | ZW,Zimbabwe,ZWE,"Zimbabwean,Zimbabweans" 200 | ,South Ossetia,GEOPRE,South Ossetian 201 | ,Abkhazia,GEOPRE,"Republic of Abkhazia, Abkhaz" 202 | ,Transnistria,MDAPRE,Transnistrian 203 | ,Luhansk People's Republic,UKRPRE,Luhansk 204 | ,Donetsk People's Republic,UKRPRE,Donetsk 205 | ,Nagorno-Karabakh,AZEPRE,Republic of Artsakh 206 | ,Somaliland,SOMPRE,Somalilander 207 | ,Northern Cyprus,CYPPRE,North Cypriot -------------------------------------------------------------------------------- /NGEC/assets/countries.numbers: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/countries.numbers -------------------------------------------------------------------------------- /NGEC/assets/cow2iso.txt: -------------------------------------------------------------------------------- 1 | cow_id,cow3,iso_id,iso2,iso3,valid_from,valid_until,cname,cname_full,comments,statenme 2 | 2,USA,841,US,USA,1962,1980,USA (before 1981),USA and Puerto Rico,Including Puerto Rico,United States of America 3 | 2,USA,842,US,USA,1981,,USA,"USA, Puerto Rico and US Virgin Islands",Including Puerto Rico and US Virgin Islands,United States of America 4 | 20,CAN,124,CA,CAN,1962,,Canada,Canada,,Canada 5 | 31,BHM,44,BS,BHS,1962,,Bahamas,Bahamas,,Bahamas 6 | 40,CUB,192,CU,CUB,1962,,Cuba,Cuba,,Cuba 7 | 41,HAI,332,HT,HTI,1962,,Haiti,Haiti,,Haiti 8 | 42,DOM,214,DO,DOM,1962,,Dominican Rep.,Dominican Republic,,Dominican Republic 9 | 51,JAM,388,JM,JAM,1962,,Jamaica,Jamaica,,Jamaica 10 | 52,TRI,780,TT,TTO,1962,,Trinidad and Tobago,Trinidad and Tobago,,Trinidad and Tobago 11 | 53,BAR,52,BB,BRB,1962,,Barbados,Barbados,,Barbados 12 | 54,DMA,212,DM,DMA,1962,,Dominica,Dominica,,Dominica 13 | 55,GRN,308,GD,GRD,1962,,Grenada,Grenada,,Grenada 14 | 56,SLU,662,LC,LCA,1962,,Saint Lucia,Saint Lucia,,St. Lucia 15 | 57,SVG,670,VC,VCT,1962,,Saint Vincent and the Grenadines,Saint Vincent and the Grenadines,,St. Vincent and the Grenadines 16 | 58,AAB,28,AG,ATG,1962,,Antigua and Barbuda,Antigua and Barbuda,,Antigua & Barbuda 17 | 60,SKN,658,KN,KNA,1962,1980,"Saint Kitts, Nevis and Anguilla","Saint Kitts, Nevis and Anguilla",,St. Kitts and Nevis 18 | 60,SKN,659,KN,KNA,1981,,Saint Kitts and Nevis,Saint Kitts and Nevis,,St.
Kitts and Nevis 19 | 70,MEX,484,MX,MEX,1962,,Mexico,Mexico,,Mexico 20 | 80,BLZ,84,BZ,BLZ,1962,,Belize,Belize,,Belize 21 | 90,GUA,320,GT,GTM,1962,,Guatemala,Guatemala,,Guatemala 22 | 91,HON,340,HN,HND,1962,,Honduras,Honduras,,Honduras 23 | 92,SAL,222,SV,SLV,1962,,El Salvador,El Salvador,,El Salvador 24 | 93,NIC,558,NI,NIC,1962,,Nicaragua,Nicaragua,,Nicaragua 25 | 94,COS,188,CR,CRI,1962,,Costa Rica,Costa Rica,,Costa Rica 26 | 95,PAN,591,PA,PAN,1978,,Panama,Panama,,Panama 27 | 95,PAN,590,PA,PAN,1962,1977,"Fmr Panama, excl.Canal Zone","Former Panama, excluding Canal Zone",,Panama 28 | 100,COL,170,CO,COL,1962,,Colombia,Colombia,,Colombia 29 | 101,VEN,862,VE,VEN,1962,,Venezuela,Venezuela,,Venezuela 30 | 110,GUY,328,GY,GUY,1962,,Guyana,Guyana,,Guyana 31 | 115,SUR,740,SR,SUR,1962,,Suriname,Suriname,,Suriname 32 | 130,ECU,218,EC,ECU,1962,,Ecuador,Ecuador,,Ecuador 33 | 135,PER,604,PE,PER,1962,,Peru,Peru,,Peru 34 | 140,BRA,76,BR,BRA,1962,,Brazil,Brazil,,Brazil 35 | 145,BOL,68,BO,BOL,1962,,Bolivia (Plurinational State of),Plurinational State of Bolivia,,Bolivia 36 | 150,PAR,600,PY,PRY,1962,,Paraguay,Paraguay,,Paraguay 37 | 155,CHL,152,CL,CHL,1962,,Chile,Chile,,Chile 38 | 160,ARG,32,AR,ARG,1962,,Argentina,Argentina,,Argentina 39 | 165,URU,858,UY,URY,1962,,Uruguay,Uruguay,,Uruguay 40 | 200,UKG,826,GB,GBR,1962,,United Kingdom,United Kingdom,,United Kingdom 41 | 205,IRE,372,IE,IRL,1962,,Ireland,Ireland,,Ireland 42 | 210,NTH,528,NL,NLD,1962,,Netherlands,Netherlands,,Netherlands 43 | 211,BEL,58,BE,BEL,1962,1998,Belgium-Luxembourg,Belgium-Luxembourg,,Belgium 44 | 211,BEL,56,BE,BEL,1999,,Belgium,Belgium,,Belgium 45 | 212,LUX,442,LU,LUX,1999,,Luxembourg,Luxembourg,,Luxembourg 46 | 220,FRN,251,FR,FRA,1962,,France,"France, Monaco",Including Monaco,France 47 | 221,MNC,,,,,,Monaco,,,Monaco 48 | 223,LIE,,,,,,Liechtenstein,,,Liechtenstein 49 | 225,SWZ,757,CH,CHE,1962,,Switzerland,"Switzerland, Liechtenstein",Liechtenstein,Switzerland 50 | 230,SPN,724,ES,ESP,1962,,Spain,Spain,,Spain 51 | 232,AND,20,AD,AND,1962,,Andorra,Andorra,,Andorra 52 | 235,POR,620,PT,PRT,1962,,Portugal,Portugal,,Portugal 53 | 240,HAN,,,,,,Hanover,,,Hanover 54 | 245,BAV,,,,,,Bavaria,,,Bavaria 55 | 255,GMY,276,DE,DEU,1991,,Germany,Germany,,Germany 56 | 260,GFR,280,DE,DEU,1962,1990,Fmr Fed. Rep. of Germany,Former Federal Republic of Germany,,German Federal Republic 57 | 265,GDR,278,DD,DDR,1962,1990,Fmr Dem. Rep. 
of Germany,Former Democratic Republic of Germany,,German Democratic Republic 58 | 267,BAD,,,,,,Baden,,,Baden 59 | 269,SAX,,,,,,Saxony,,,Saxony 60 | 271,WRT,,,,,,Wuerttemburg,,,Wuerttemburg 61 | 273,HSE,,,,,,Hesse Electoral,,,Hesse Electoral 62 | 275,HSG,,,,,,Hesse Grand Ducal,,,Hesse Grand Ducal 63 | 280,MEC,,,,,,Mecklenburg Schwerin,,,Mecklenburg Schwerin 64 | 290,POL,616,PL,POL,1962,,Poland,Poland,,Poland 65 | 300,AUH,,,,,,Austria-Hungary,,,Austria-Hungary 66 | 305,AUS,40,AT,AUT,1962,,Austria,Austria,,Austria 67 | 310,HUN,348,HU,HUN,1962,,Hungary,Hungary,,Hungary 68 | 315,CZE,200,CS,CSK,1962,1992,Czechoslovakia,Czechoslovakia,,Czechoslovakia 69 | 316,CZR,203,CZ,CZE,1993,,Czechia,Czechia,,Czech Republic 70 | 317,SLO,703,SK,SVK,1993,,Slovakia,Slovakia,,Slovakia 71 | 325,ITA,381,IT,ITA,1962,,Italy,Italy,,Italy 72 | 327,PAP,,,,,,Papal States,,,Papal States 73 | 329,SIC,,,,,,Two Sicilies,,,Two Sicilies 74 | 331,SNM,674,SM,SMR,2000,,San Marino,San Marino,,San Marino 75 | 332,MOD,,,,,,Modena,,,Modena 76 | 335,PMA,,,,,,Parma,,,Parma 77 | 337,TUS,,,,,,Tuscany,,,Tuscany 78 | 338,MLT,470,MT,MLT,1962,,Malta,Malta,,Malta 79 | 339,ALB,8,AL,ALB,1962,,Albania,Albania,,Albania 80 | 341,MNG,499,ME,MNE,2006,,Montenegro,Montenegro,,Montenegro 81 | 343,,446,MO,MAC,1962,,"China, Macao SAR","China, Macao Special Administrative Region",,Macedonia 82 | 343,MAC,807,MK,MKD,1993,,TFYR of Macedonia,The Former Yugoslav Republic of Macedonia,,Macedonia 83 | 344,CRO,191,HR,HRV,1992,,Croatia,Croatia,,Croatia 84 | 345,YUG,890,YU,YUG,1962,1991,Fmr Yugoslavia,Former Yugoslavia,In 1992 including TFYR of Macedonia,Yugoslavia 85 | 346,BOS,70,BA,BIH,1992,,Bosnia Herzegovina,Bosnia Herzegovina,,Bosnia and Herzegovina 86 | 347,KOS,,,,,,Kosovo,,,Kosovo 87 | 349,SLV,705,SI,SVN,1992,,Slovenia,Slovenia,,Slovenia 88 | 350,GRC,300,GR,GRC,1962,,Greece,Greece,,Greece 89 | 352,CYP,196,CY,CYP,1962,,Cyprus,Cyprus,,Cyprus 90 | 355,BUL,100,BG,BGR,1962,,Bulgaria,Bulgaria,,Bulgaria 91 | 359,MLD,498,MD,MDA,1992,,Rep. 
of Moldova,Republic of Moldova,,Moldova 92 | 360,ROM,642,RO,ROU,1962,,Romania,Romania,,Romania 93 | 365,RUS,643,RU,RUS,1992,,Russian Federation,Russian Federation,,Russia 94 | 366,EST,233,EE,EST,1992,,Estonia,Estonia,,Estonia 95 | 367,LAT,428,LV,LVA,1992,,Latvia,Latvia,,Latvia 96 | 368,LIT,440,LT,LTU,1992,,Lithuania,Lithuania,,Lithuania 97 | 369,UKR,804,UA,UKR,1992,,Ukraine,Ukraine,,Ukraine 98 | 370,BLR,112,BY,BLR,1992,,Belarus,Belarus,,Belarus 99 | 371,ARM,51,AM,ARM,1992,,Armenia,Armenia,,Armenia 100 | 372,GRG,268,GE,GEO,1992,,Georgia,Georgia,,Georgia 101 | 373,AZE,31,AZ,AZE,1992,,Azerbaijan,Azerbaijan,,Azerbaijan 102 | 375,FIN,246,FI,FIN,1962,,Finland,Finland,,Finland 103 | 380,SWD,752,SE,SWE,1962,,Sweden,Sweden,,Sweden 104 | 385,NOR,579,NO,NOR,1962,,Norway,"Norway, Svalbard and Jan Mayen",Including Svalbard and Jan Mayen,Norway 105 | 390,DEN,208,DK,DNK,1962,,Denmark,Denmark,,Denmark 106 | 395,ICE,352,IS,ISL,1962,,Iceland,Iceland,,Iceland 107 | 402,CAP,132,CV,CPV,1962,,Cabo Verde,Cabo Verde,,Cape Verde 108 | 403,STP,678,ST,STP,1962,,Sao Tome and Principe,Sao Tome and Principe,,Sao Tome and Principe 109 | 404,GNB,624,GW,GNB,1962,,Guinea-Bissau,Guinea-Bissau,,Guinea-Bissau 110 | 411,EQG,226,GQ,GNQ,1962,,Equatorial Guinea,Equatorial Guinea,,Equatorial Guinea 111 | 420,GAM,270,GM,GMB,1962,,Gambia,Gambia,,Gambia 112 | 432,MLI,466,ML,MLI,1962,,Mali,Mali,,Mali 113 | 433,SEN,686,SN,SEN,1962,,Senegal,Senegal,,Senegal 114 | 434,BEN,204,BJ,BEN,1962,,Benin,Benin,,Benin 115 | 435,MAA,478,MR,MRT,1962,,Mauritania,Mauritania,,Mauritania 116 | 436,NIR,562,NE,NER,1962,,Niger,Niger,,Niger 117 | 437,CDI,384,CI,CIV,1962,,Côte d'Ivoire,Côte d'Ivoire,,Ivory Coast 118 | 438,GUI,324,GN,GIN,1962,,Guinea,Guinea,,Guinea 119 | 439,BFO,854,BF,BFA,1962,,Burkina Faso,Burkina Faso,,Burkina Faso 120 | 450,LBR,430,LR,LBR,1962,,Liberia,Liberia,,Liberia 121 | 451,SIE,694,SL,SLE,1962,,Sierra Leone,Sierra Leone,,Sierra Leone 122 | 452,GHA,288,GH,GHA,1962,,Ghana,Ghana,,Ghana 123 | 461,TOG,768,TG,TGO,1962,,Togo,Togo,,Togo 124 | 471,CAO,120,CM,CMR,1962,,Cameroon,Cameroon,,Cameroon 125 | 475,NIG,566,NG,NGA,1962,,Nigeria,Nigeria,,Nigeria 126 | 481,GAB,266,GA,GAB,1962,,Gabon,Gabon,,Gabon 127 | 482,CEN,140,CF,CAF,1962,,Central African Rep.,Central African Republic,,Central African Republic 128 | 483,CHA,148,TD,TCD,1962,,Chad,Chad,,Chad 129 | 484,CON,178,CG,COG,1962,,Congo,Congo,,Congo 130 | 490,DRC,180,CD,COD,1962,,Dem. Rep. of the Congo,Democratic Republic of the Congo,,Democratic Republic of the Congo 131 | 500,UGA,800,UG,UGA,1962,,Uganda,Uganda,,Uganda 132 | 501,KEN,404,KE,KEN,1962,,Kenya,Kenya,,Kenya 133 | 510,TAZ,834,TZ,TZA,1965,,United Rep. of Tanzania,United Republic of Tanzania,,Tanzania 134 | 511,ZAN,,,,,,Zanzibar,,,Zanzibar 135 | 516,BUI,108,BI,BDI,1962,,Burundi,Burundi,,Burundi 136 | 517,RWA,646,RW,RWA,1962,,Rwanda,Rwanda,,Rwanda 137 | 520,SOM,706,SO,SOM,1962,,Somalia,Somalia,,Somalia 138 | 522,DJI,262,DJ,DJI,1962,,Djibouti,Djibouti,,Djibouti 139 | 530,ETH,230,ET,ETH,1962,1992,Fmr Ethiopia,Former Ethiopia,,Ethiopia 140 | 530,ETH,231,ET,ETH,1993,,Ethiopia,Ethiopia,,Ethiopia 141 | 531,ERI,232,ER,ERI,1993,,Eritrea,Eritrea,,Eritrea 142 | 540,ANG,24,AO,AGO,1962,,Angola,Angola,,Angola 143 | 541,MZM,508,MZ,MOZ,1962,,Mozambique,Mozambique,,Mozambique 144 | 551,ZAM,894,ZM,ZMB,1965,,Zambia,Zambia,,Zambia 145 | 552,ZIM,716,ZW,ZWE,1965,,Zimbabwe,Zimbabwe,,Zimbabwe 146 | 553,MAW,454,MW,MWI,1965,,Malawi,Malawi,,Malawi 147 | 560,SAF,710,ZA,ZAF,2000,,South Africa,South Africa,,South Africa 148 | 560,SAF,711,ZA,ZAF,1962,1999,So. 
African Customs Union,Southern African Customs Union,,South Africa 149 | 565,NAM,516,NA,NAM,2000,,Namibia,Namibia,,Namibia 150 | 570,LES,426,LS,LSO,2000,,Lesotho,Lesotho,,Lesotho 151 | 571,BOT,72,BW,BWA,2000,,Botswana,Botswana,,Botswana 152 | 572,SWA,748,SZ,SWZ,2000,,Swaziland,Swaziland,,Swaziland 153 | 580,MAG,450,MG,MDG,1962,,Madagascar,Madagascar,,Madagascar 154 | 581,COM,174,KM,COM,1962,,Comoros,Comoros,,Comoros 155 | 590,MAS,480,MU,MUS,1962,,Mauritius,Mauritius,,Mauritius 156 | 591,SEY,690,SC,SYC,1962,,Seychelles,Seychelles,,Seychelles 157 | 600,MOR,504,MA,MAR,1962,,Morocco,Morocco,,Morocco 158 | 615,ALG,12,DZ,DZA,1962,,Algeria,Algeria,,Algeria 159 | 616,TUN,788,TN,TUN,1962,,Tunisia,Tunisia,,Tunisia 160 | 620,LIB,434,LY,LBY,1962,,Libya,Libya,,Libya 161 | 625,SUD,729,SD,SDN,2012,,Sudan,Sudan,"Refers to the new Sudan, now excluding the southern part.",Sudan 162 | 625,SUD,736,SD,SDN,1962,2011,Fmr Sudan,Former Sudan,,Sudan 163 | 626,SSD,728,SS,SSD,2012,,South Sudan,South Sudan,,South Sudan 164 | 630,IRN,364,IR,IRN,1962,,Iran,Iran,,Iran 165 | 640,TUR,792,TR,TUR,1962,,Turkey,Turkey,,Turkey 166 | 645,IRQ,368,IQ,IRQ,1962,,Iraq,Iraq,,Iraq 167 | 651,EGY,818,EG,EGY,1962,,Egypt,Egypt,,Egypt 168 | 652,SYR,760,SY,SYR,1962,,Syria,Syria,,Syria 169 | 660,LEB,422,LB,LBN,1962,,Lebanon,Lebanon,,Lebanon 170 | 663,JOR,400,JO,JOR,1962,,Jordan,Jordan,,Jordan 171 | 666,ISR,376,IL,ISR,1962,,Israel,Israel,,Israel 172 | 670,SAU,682,SA,SAU,1962,,Saudi Arabia,Saudi Arabia,,Saudi Arabia 173 | 678,YAR,886,YE,YEM,1962,1990,Fmr Arab Rep. of Yemen,Former Arab Republic of Yemen,,Yemen Arab Republic 174 | 679,YEM,887,YE,YEM,1991,,Yemen,Yemen,,Yemen 175 | 680,YPR,720,YD,YMD,1962,1990,Fmr Dem. Yemen,Former Democratic Yemen,,Yemen People's Republic 176 | 690,KUW,414,KW,KWT,1962,,Kuwait,Kuwait,,Kuwait 177 | 692,BAH,48,BH,BHR,1962,,Bahrain,Bahrain,,Bahrain 178 | 694,QAT,634,QA,QAT,1962,,Qatar,Qatar,,Qatar 179 | 696,UAE,784,AE,ARE,1962,,United Arab Emirates,United Arab Emirates,,United Arab Emirates 180 | 698,OMA,512,OM,OMN,1962,,Oman,Oman,,Oman 181 | 700,AFG,4,AF,AFG,1962,,Afghanistan,Afghanistan,,Afghanistan 182 | 701,TKM,795,TM,TKM,1992,,Turkmenistan,Turkmenistan,,Turkmenistan 183 | 702,TAJ,762,TJ,TJK,1992,,Tajikistan,Tajikistan,,Tajikistan 184 | 703,KYR,417,KG,KGZ,1992,,Kyrgyzstan,Kyrgyzstan,,Kyrgyzstan 185 | 704,UZB,860,UZ,UZB,1992,,Uzbekistan,Uzbekistan,,Uzbekistan 186 | 705,KZK,398,KZ,KAZ,1992,,Kazakhstan,Kazakhstan,,Kazakhstan 187 | 710,CHN,156,CN,CHN,1962,,China,China,,China 188 | 712,MON,496,MN,MNG,1962,,Mongolia,Mongolia,,Mongolia 189 | 713,TAW,,,,,,Taiwan,,,Taiwan 190 | 730,KOR,,,,,,Korea,,,Korea 191 | 731,PRK,408,KP,PRK,1962,,Dem. People's Rep. of Korea,Democratic People's Republic of Korea,,North Korea 192 | 732,ROK,410,KR,KOR,1962,,Rep. of Korea,Republic of Korea,,South Korea 193 | 740,JPN,392,JP,JPN,1962,,Japan,Japan,,Japan 194 | 750,IND,356,IN,IND,1962,1974,"India, excl. 
Sikkim","India, excluding Sikkim",,India 195 | 750,IND,699,IN,IND,1975,,India,India,,India 196 | 760,BHU,64,BT,BTN,1962,,Bhutan,Bhutan,,Bhutan 197 | 770,PAK,586,PK,PAK,1972,,Pakistan,Pakistan,,Pakistan 198 | 770,PAK,588,PK,PAK,1962,1971,East and West Pakistan,East and West Pakistan,,Pakistan 199 | 771,BNG,50,BD,BGD,1972,,Bangladesh,Bangladesh,,Bangladesh 200 | 775,MYA,104,MM,MMR,1962,,Myanmar,Myanmar,,Myanmar 201 | 780,SRI,144,LK,LKA,1962,,Sri Lanka,Sri Lanka,,Sri Lanka 202 | 781,MAD,462,MV,MDV,1962,,Maldives,Maldives,,Maldives 203 | 790,NEP,524,NP,NPL,1962,,Nepal,Nepal,,Nepal 204 | 800,THI,764,TH,THA,1962,,Thailand,Thailand,,Thailand 205 | 811,CAM,116,KH,KHM,1962,,Cambodia,Cambodia,,Cambodia 206 | 812,LAO,418,LA,LAO,1962,,Lao People's Dem. Rep.,Lao People's Dem. Rep.,,Laos 207 | 816,DRV,,,,,,Vietnam,,,Vietnam 208 | 817,RVN,868,VN,VNM,1962,1974,Fmr Rep. of Vietnam,Former Republic of Vietnam,,Republic of Vietnam 209 | 817,RVN,704,VN,VNM,1975,,Viet Nam,Viet Nam,,Republic of Vietnam 210 | 820,MAL,458,MY,MYS,1964,,Malaysia,Malaysia,,Malaysia 211 | 830,SIN,702,SG,SGP,1962,,Singapore,Singapore,,Singapore 212 | 835,BRU,96,BN,BRN,1962,,Brunei Darussalam,Brunei Darussalam,,Brunei 213 | 840,PHI,608,PH,PHL,1962,,Philippines,Philippines,,Philippines 214 | 850,INS,360,ID,IDN,1962,,Indonesia,Indonesia,,Indonesia 215 | 860,ETM,626,TL,TLS,1962,,Timor-Leste,Timor-Leste,,East Timor 216 | 900,AUL,36,AU,AUS,1962,,Australia,Australia,,Australia 217 | 910,PNG,598,PG,PNG,1962,,Papua New Guinea,Papua New Guinea,,Papua New Guinea 218 | 920,NEW,554,NZ,NZL,1962,,New Zealand,New Zealand,,New Zealand 219 | 935,VAN,548,VU,VUT,1962,,Vanuatu,Vanuatu,,Vanuatu 220 | 940,SOL,90,SB,SLB,1962,,Solomon Isds,Solomon Islands,,Solomon Islands 221 | 946,KIR,296,KI,KIR,1962,,Kiribati,Kiribati,,Kiribati 222 | 947,TUV,798,TV,TUV,1962,,Tuvalu,Tuvalu,,Tuvalu 223 | 950,FIJ,242,FJ,FJI,1962,,Fiji,Fiji,,Fiji 224 | 955,TON,776,TO,TON,1962,,Tonga,Tonga,,Tonga 225 | 970,NAU,520,NR,NRU,1962,,Nauru,Nauru,,Nauru 226 | 983,MSI,584,MH,MHL,1992,,Marshall Isds,Marshall Islands,,Marshall Islands 227 | 986,PAL,585,PW,PLW,1992,,Palau,Palau,,Palau 228 | 987,FSM,583,FM,FSM,1992,,FS Micronesia,Federated State of Micronesia,,Federated States of Micronesia 229 | 990,WSM,882,WS,WSM,1962,,Samoa,Samoa,,Samoa 230 | ,,533,AW,ABW,1988,,Aruba,Aruba,, 231 | ,,660,AI,AIA,1981,,Anguilla,Anguilla,, 232 | ,,532,AN,ANT,1962,1987,Neth. Antilles and Aruba,Netherlands Antilles and Aruba,, 233 | ,,530,AN,ANT,1988,2010,Neth. Antilles,Netherlands Antilles,, 234 | ,,16,AS,ASM,1962,,American Samoa,American Samoa,, 235 | ,,10,AQ,ATA,1962,,Antarctica,Antarctica,, 236 | ,,260,FQ,ATF,1962,,Fr. 
South Antarctic Terr.,French South Antarctic Territories,, 237 | ,,535,BQ,BES,2011,,Bonaire,"Bonaire, Saint Eustatius and Saba",, 238 | ,,652,BL,BLM,2013,,Saint Barthélemy,Saint Barthélemy,, 239 | ,,60,BM,BMU,1962,,Bermuda,Bermuda,, 240 | ,,166,CC,CCK,1962,,Cocos Isds,Cocos Islands,, 241 | ,,184,CK,COK,1962,,Cook Isds,Cook Islands,, 242 | ,,531,CW,CUW,2010,,Curaçao,Curaçao,, 243 | ,,162,CX,CXR,1962,,Christmas Isds,Christmas Islands,, 244 | ,,136,KY,CYM,1962,,Cayman Isds,Cayman Islands,, 245 | ,,732,EH,ESH,1962,,Western Sahara,Western Sahara,, 246 | ,,97,EU,EU2,1962,,EU-28,EU-28,, 247 | ,,238,FK,FLK,1962,,Falkland Isds (Malvinas),Falkland Islands (Malvinas),, 248 | ,,234,FO,FRO,1962,,Faeroe Isds,Faeroe Islands,, 249 | ,,292,GI,GIB,1962,,Gibraltar,Gibraltar,, 250 | ,,312,GP,GLP,1962,1995,Guadeloupe,Guadeloupe,, 251 | ,,304,GL,GRL,1962,,Greenland,Greenland,, 252 | ,,254,GF,GUF,1962,1995,French Guiana,French Guiana,, 253 | ,,316,GU,GUM,1962,,Guam,Guam,, 254 | ,,344,HK,HKG,1962,,"China, Hong Kong SAR","China, Hong Kong Special Administrative Region",, 255 | ,,334,HM,HMD,1962,,Heard Island and McDonald Islands,Heard Island and McDonald Islands,, 256 | ,,86,IO,IOT,1962,,Br. Indian Ocean Terr.,British Indian Ocean Territories,, 257 | ,,580,MP,MNP,1992,,N. Mariana Isds,Northern Mariana Islands,, 258 | ,,500,MS,MSR,1962,,Montserrat,Montserrat,, 259 | ,,474,MQ,MTQ,1962,1995,Martinique,Martinique,, 260 | ,,175,YT,MYT,1962,,Mayotte,Mayotte,, 261 | ,,129,N/A,N/A,1962,2004,"Caribbean, nes","Caribbean, not elsewhere specified",, 262 | ,,838,N/A,N/A,1962,,Free Zones,Free Zones,, 263 | ,,221,N/A,N/A,1962,2004,"Eastern Europe, nes","Eastern Europe, not elsewhere specified",, 264 | ,,837,N/A,N/A,1962,,Bunkers,Bunkers,, 265 | ,,835,N/A,N/A,1962,1964,Fmr Tanganyika,Former Tanganyika,, 266 | ,,490,N/A,N/A,1962,,"Other Asia, nes","Other Asia, not elsewhere specified",, 267 | ,,577,N/A,N/A,1962,,"Other Africa, nes","Other Africa, not elsewhere specified",, 268 | ,,849,N/A,N/A,1962,1962,US Misc. Pacific Isds,US Miscellaneous Pacific Islands,, 269 | ,,290,N/A,N/A,1962,2004,"Northern Africa, nes","Northern Africa, not elsewhere specified",, 270 | ,,459,N/A,N/A,1962,1963,Peninsula Malaysia,Peninsula Malaysia,, 271 | ,,836,N/A,N/A,1962,1964,Fmr Zanzibar and Pemba Isd,Former Zanzibar and Pemba Island,, 272 | ,,717,N/A,N/A,1962,1964,Fmr Rhodesia Nyas,Former Rhodesia Nyas,, 273 | ,,527,N/A,N/A,1962,,"Oceania, nes","Oceania, not elsewhere specified",, 274 | ,,899,N/A,N/A,1962,,"Areas, nes","Areas, not elsewhere specified",, 275 | ,,473,N/,N/A,1962,,"LAIA, nes","LAIA, not elsewhere specified",, 276 | ,,647,N/A,N/A,1962,1972,Ryukyu Isd,Ryukyu Island,, 277 | ,,698,N/A,N/A,1962,1974,Sikkim,Sikkim,, 278 | ,,492,N/A,N/A,1962,2004,"Europe EU, nes","Europe EU, not elsewhere specified",Code 492 is mapped to code 568 since 2005, 279 | ,,80,N/A,N/A,1962,,Br. 
Antarctic Terr.,British Antarctic Territories,, 280 | ,,74,N/A,N/A,1962,,Bouvet Island,Ile Bouvet,, 281 | ,,568,N/A,N/A,1962,,"Other Europe, nes","Other Europe, not elsewhere specified",, 282 | ,,697,N/A,N/A,1962,2004,"Europe EFTA, nes","Europe EFTA, not elsewhere specified",, 283 | ,,879,N/A,N/A,1962,2004,"Western Asia, nes","Western Asia, not elsewhere specified",, 284 | ,,636,N/A,N/A,1962,2004,"Rest of America, nes","Rest of America, not elsewhere specified",, 285 | ,,839,N/A,N/A,1962,,Special Categories,Special Categories,, 286 | ,,471,N/A,N/A,1962,2004,"CACM, nes","CACM, not elsewhere specified",, 287 | ,,472,N/A,N/A,1962,2004,"Africa CAMEU region, nes","Africa CAMEU region, not elsewhere specified",, 288 | ,,461,N/A,N/A,1962,1963,Sabah,Sabah,, 289 | ,,637,N/A,N/A,1962,,"North America and Central America, nes","North America, the Caribbean and Central America not elsewhere specified",, 290 | ,,536,N/A,N/A,1962,,Neutral Zone,Neutral Zone,, 291 | ,,457,N/A,N/A,1962,1963,Sarawak,Sarawak,, 292 | ,,540,NC,NCL,1962,,New Caledonia,New Caledonia,, 293 | ,,574,NF,NFK,1962,,Norfolk Isds,Norfolk Islands,, 294 | ,,570,NU,NIU,1962,,Niue,Niue,, 295 | ,,582,PC,PCI,1962,1991,Fmr Pacific Isds,Former Pacific Islands,, 296 | ,,612,PN,PCN,1962,,Pitcairn,Pitcairn,, 297 | ,,592,PZ,PCZ,1962,1977,Fmr Panama-Canal-Zone,Former Panama-Canal-Zone,, 298 | ,,275,PS,PSE,2000,,State of Palestine,State of Palestine,, 299 | ,,258,PF,PYF,1962,,French Polynesia,French Polynesia,, 300 | ,,638,RE,REU,1962,1995,Réunion,Réunion,, 301 | ,,891,CS,SCG,1992,2005,Serbia and Montenegro,Serbia and Montenegro,, 302 | ,,239,GS,SGS,1962,,South Georgia and the South Sandwich Islands,South Georgia and the South Sandwich Islands,, 303 | ,,654,SH,SHN,1962,,Saint Helena,Saint Helena,, 304 | ,,666,PM,SPM,1962,,Saint Pierre and Miquelon,Saint Pierre and Miquelon,, 305 | ,,688,RS,SRB,2006,,Serbia,Serbia,, 306 | ,,810,SU,SUN,1962,1991,Fmr USSR,Former USSR,, 307 | ,,534,SX,SXM,2010,,Saint Maarten,Saint Maarten (Dutch part),, 308 | ,,796,TC,TCA,1962,,Turks and Caicos Isds,Turks and Caicos Islands,, 309 | ,,772,TK,TKL,1962,,Tokelau,Tokelau,, 310 | ,,581,UM,UMI,1962,,United States Minor Outlying Islands,United States Minor Outlying Islands,United States Minor Outlying Islands, 311 | ,,336,VA,VAT,2000,,Holy See (Vatican City State),Holy See (Vatican City State),, 312 | ,,866,VD,VDR,1962,1974,Fmr Dem. Rep. of Vietnam,Former Democratic Republic of Vietnam,, 313 | ,,92,VG,VGB,1962,,Br. Virgin Isds,British Virgin Islands,, 314 | ,,850,VI,VIR,1962,1980,US Virgin Isds,US Virgin Islands,, 315 | ,,0,WL,WLD,1962,,World,World,, 316 | ,,876,WF,WLF,1962,,Wallis and Futuna Isds,Wallis and Futuna Islands,, 317 | -------------------------------------------------------------------------------- /NGEC/assets/event_mode_questions.csv: -------------------------------------------------------------------------------- 1 | event_type,mode,ACTOR,RECIP,LOCATION,TIME,notes 2 | ASSAULT,,"Who carried out an attack on {recip_text}? 3 | Who attacked {recip_text}?","Who was attacked by {actor_text}? 4 | Who did {actor_text} attack?",Where did the attack take place?,When did the attack take place?, 5 | COERCE,,Who coerced {recip_text}?,"Who was coerced by {actor_text}? 6 | Who did {actor_text} coerce?",Where did the coercion occur?,When did the coercion occur?, 7 | PROTEST,,"Who protested? 8 | Who protested against {recip_text}?","Who was the protest directed against? 
9 | Who was {actor_text} protesting against?",Where did the protest take place?,When did the protest take place?, 10 | MOBILIZE,,"Who mobilized forces? 11 | Who mobilized forces against {recip_text}?","Who did {actor_text} mobilize forces against? 12 | Who was the mobilization directed against?",Where were forces mobilized?,When were forces mobilized?, 13 | SANCTION,,"Who sanctioned {recip_text}? 14 | Who punished {recip_text}? 15 | Who imposed sanctions against {recip_text}?","Who was sanctioned? 16 | Who was sanctioned by {actor_text}?",Where were sanctions imposed?,When were sanctions imposed?, 17 | THREATEN,,"Who threatened {recip_text}? 18 | Who made a threat against {recip_text}?","Who was threatened? 19 | Who was threatened by {actor_text}?",Where was the threat made?,When was the threat made?, 20 | REQUEST,,"Who made a request? 21 | Who requested something from {recip_text}?","Who did {actor_text} request something from? 22 | Who was the request directed toward?",Where was the request made?,When was the request made?, 23 | REJECT,,Who rejected something?,"Whose claim was rejected by someone? 24 | Whose claim was rejected by {actor_text}?",Where did the rejection occur?,When did the rejection occur?, 25 | ACCUSE,,"Who accused {recip_text} of something? 26 | Who accused {recip_text}?","Who was the accusation against? 27 | Who did {actor_text} accuse of something?",Where did the accusation occur?,When did the accusation occur?, 28 | CONSULT,,"Who consulted? 29 | Who did {recip_text} consult with?","Who consulted? 30 | Who did {actor_text} consult with? 31 | Who was consulted?",Where did the consultation occur?,When did the consultation occur?, 32 | AGREE,,Who agreed with {recip_text}?,Who did {actor_text} agree with?,Where was the agreement reached?,When was the agreement reached?, 33 | SUPPORT,,"Who gave support to {recip_text}? 34 | Who expressed support for {recip_text}?","Who received support? 35 | Who was supported by {actor_text}?",Where was support expressed?,When was support expressed?, 36 | CONCEDE,,"Who made a concession? 37 | Who made a concession to {recip_text}?","Who was the concession made to? 38 | Who did {actor_text} make a concession to?",Where were concessions made?,When were concessions made?, 39 | COOPERATE,,"Who cooperated? 40 | Who cooperated with {recip_text}?","Who cooperated? 41 | Who cooperated with {actor_text}?",Where was the agreement signed?,When was the agreement signed?, 42 | AID,,"Who gave aid? 43 | Who gave aid to {recip_text}?","Who received aid? 44 | Who got aid from {actor_text}?",Where was the aid delivered?,When was the aid delivered?, 45 | RETREAT,,"Who retreated or gave something up? 46 | Who retreated from {recip_text}?","Who did someone retreat from? 47 | Who did {actor_text} retreat from?",Where did the retreat occur?,When did the retreat occur?, 48 | COERCE,seize,Who seized something from {recip_text}?,Who did {actor_text} seize something from?,Where did the seizure of property take place?,When did the seizure of property take place?, 49 | COERCE,restrict,"Who imposed restrictions? 50 | Who restricted the rights of {recip_text}?",Whose rights were restricted by {actor_text}?,Where were restrictions imposed?,When were restrictions imposed?, 51 | COERCE,ban,Who banned {recip_text}?,Who did {actor_text} ban?,Where was the ban applied?,When was the ban applied?, 52 | COERCE,censor,"Who did the censoring? 53 | Who censored {recip_text}?","Who did {actor_text} censor? 
54 | Who was censored?",Where was censoring applied?,When was censoring applied?, 55 | COERCE,curfew,"Who imposed a curfew? 56 | Who restricted {recip_text}'s movement?","Who did {actor_text} impose a curfew on? 57 | Whose movement was restricted? 58 | Whose movement was restricted by {actor_text}?",Where was the curfew imposed?,When was the curfew imposed?, 59 | COERCE,martial-law,Who declared martial law?,Who did {actor_text} impose martial law on?,Where was martial law declared?,When was martial law declared?, 60 | COERCE,arrest,"Who arrested {recip_text}? 61 | Who detained {recip_text}?","Who was arrested by {actor_text}? 62 | Who was detained by {actor_text}?",Where were arrests made?,When were arrests made?, 63 | COERCE,deport,Who deported {recip_text}?,Who did {actor_text} deport?,Where was the deportation made from?,When did the deportation take place?, 64 | COERCE,withhold,Who withheld or shut off supplies for {recip_text}?,Who did {actor_text} withhold or shut off supplies to?,Where were supplies restricted?,When were supplies restricted?, 65 | COERCE,misinformation,Who carried out a misinformation campaign against {recip_text}?,Who did {actor_text} carry out a misinformation campaign against?,Where did the misinformation campaign occur?,When did the misinformation campaign occur?, -------------------------------------------------------------------------------- /NGEC/assets/event_models/ACCUSE.skops: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/event_models/ACCUSE.skops -------------------------------------------------------------------------------- /NGEC/assets/event_models/AGREE.skops: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/event_models/AGREE.skops -------------------------------------------------------------------------------- /NGEC/assets/event_models/AID.skops: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/event_models/AID.skops -------------------------------------------------------------------------------- /NGEC/assets/event_models/ASSAULT.skops: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/event_models/ASSAULT.skops -------------------------------------------------------------------------------- /NGEC/assets/event_models/COERCE.skops: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/event_models/COERCE.skops -------------------------------------------------------------------------------- /NGEC/assets/event_models/CONCEDE.skops: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/event_models/CONCEDE.skops -------------------------------------------------------------------------------- /NGEC/assets/event_models/CONSULT.skops: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/event_models/CONSULT.skops -------------------------------------------------------------------------------- /NGEC/assets/event_models/COOPERATE.skops: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/event_models/COOPERATE.skops -------------------------------------------------------------------------------- /NGEC/assets/event_models/MOBILIZE.skops: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/event_models/MOBILIZE.skops -------------------------------------------------------------------------------- /NGEC/assets/event_models/PROTEST.skops: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/event_models/PROTEST.skops -------------------------------------------------------------------------------- /NGEC/assets/event_models/REJECT.skops: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/event_models/REJECT.skops -------------------------------------------------------------------------------- /NGEC/assets/event_models/REQUEST.skops: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/event_models/REQUEST.skops -------------------------------------------------------------------------------- /NGEC/assets/event_models/RETREAT.skops: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/event_models/RETREAT.skops -------------------------------------------------------------------------------- /NGEC/assets/event_models/SANCTION.skops: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/event_models/SANCTION.skops -------------------------------------------------------------------------------- /NGEC/assets/event_models/SUPPORT.skops: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/event_models/SUPPORT.skops -------------------------------------------------------------------------------- /NGEC/assets/event_models/THREATEN.skops: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/event_models/THREATEN.skops -------------------------------------------------------------------------------- /NGEC/assets/option_model.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/option_model.pt -------------------------------------------------------------------------------- 
/NGEC/assets/pattern_matrix.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/pattern_matrix.npy -------------------------------------------------------------------------------- /NGEC/assets/pattern_matrix.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/assets/pattern_matrix.pkl -------------------------------------------------------------------------------- /NGEC/context_class.py: -------------------------------------------------------------------------------- 1 | import sklearn 2 | 3 | def _load_model(model_dir): 4 | """ 5 | Load the context classification models. 6 | 7 | Parameters 8 | ---------- 9 | model_dir: Path 10 | path to the context classification models 11 | 12 | Returns 13 | ------ 14 | model_dict: dict 15 | With context classes as keys and models as values. 16 | """ 17 | raise NotImplementedError() 18 | 19 | 20 | class ContextClass: 21 | def __init__(self, 22 | model_dir="assets/context_class_models/", 23 | threshold=0.6 # we can set stuff like this here 24 | ): 25 | 26 | self.model_dict = _load_model(model_dir) 27 | self.threshold = threshold 28 | 29 | 30 | def process(self, story_list): 31 | """ 32 | Process a list of stories to detect the document context. 33 | 34 | Example 35 | ------- 36 | The input is a list of dictionaries, each with an 'event_text' key with the full text of the story. 37 | 38 | {'date': '2019-08-01', 39 | 'event_text': 'Indonesia is investigating a report that ... ', 40 | 'headline': 'Indonesia says it is probing a report of a ...', 41 | 'id': '', 42 | 'pub_date': '2019-08-01', 43 | 'publisher': '', 44 | 'story_id': '', 45 | 'version': ''} 46 | 47 | Parameters 48 | ---------- 49 | story_list: list of dicts 50 | Each dictionary must have an 'event_text' key with the full text of the story. 51 | 52 | Returns 53 | ------- 54 | story_list: list of dicts 55 | Each story dictionary now contains a 'contexts' key with a list of detected contexts (str). E.g.: 56 | 'contexts': ['diplomatic', 'legal'] 57 | 58 | 59 | """ 60 | # validate the input before the (not yet implemented) classification step 61 | for story in story_list: 62 | if 'event_text' not in story: 63 | raise ValueError("No 'event_text' key in input.") 64 | raise NotImplementedError() -------------------------------------------------------------------------------- /NGEC/event_class.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers import SentenceTransformer 2 | from sklearn.linear_model import LogisticRegression 3 | # a safer pickle alternative 4 | import skops.io as sio 5 | import numpy as np 6 | 7 | 8 | class EventClass: 9 | def __init__(self, 10 | model_dir="NGEC/assets/event_models/", 11 | threshold=0.6, 12 | progress_bar=False, 13 | event_types = ['ACCUSE', 'AGREE', 'AID', 'ASSAULT', 'COERCE', 'CONCEDE', 14 | 'CONSULT', 'COOPERATE', 'MOBILIZE', 'PROTEST', 'REJECT', 'REQUEST', 15 | 'RETREAT', 'SANCTION', 'SUPPORT', 'THREATEN'] 16 | ): 17 | self.model = SentenceTransformer('sentence-transformers/paraphrase-mpnet-base-v2') 18 | self.model_dir = model_dir 19 | self.threshold = threshold 20 | self.progress_bar = progress_bar 21 | self.event_types = event_types 22 | self.model_dict = self._load_model(model_dir) 23 | print("Event classification models loaded. NOTE: these models are not the production models used to produce the POLECAT dataset.
Instead, these are demonstration models for the PLOVER ontology trained on synthetic text. If you are making custom event data, you'll need to train your own models. See the `setup` directory in the NGEC repo (github.com/ahalterman/NGEC).") 24 | 25 | def _load_model(self, model_dir): 26 | """ 27 | Load the event classification models. 28 | 29 | Parameters 30 | ---------- 31 | model_dir: Path 32 | path to the event classification models 33 | 34 | Returns 35 | ------ 36 | model_dict: dict 37 | With event classes as keys and models as values. 38 | """ 39 | model_dict = {} 40 | for event in self.event_types: 41 | model_dict[event] = sio.load(f"{model_dir}/{event}.skops") 42 | return model_dict 43 | 44 | def process(self, story_list): 45 | """ 46 | Process a list of stories to detect the event class. 47 | 48 | Example 49 | ------- 50 | The input is a list of dictionaries, each with an 'event_text' key with the full text of the story. 51 | 52 | {'date': '2019-08-01', 53 | 'event_text': 'Indonesia is investigating a report that ... ', 54 | 'headline': 'Indonesia says it is probing a report of a ...', 55 | 'id': '', 56 | 'pub_date': '2019-08-01', 57 | 'publisher': '', 58 | 'story_id': '', 59 | 'version': ''} 60 | 61 | Parameters 62 | ---------- 63 | story_list: list of dicts 64 | Each dictionary must have an 'event_text' key with the full text of the story. 65 | 66 | Returns 67 | ------- 68 | story_list: list of dicts 69 | Each story dictionary now contains an 'event_type' key with a list of detected events (str). E.g.: 70 | 'event_type': ['SANCTION', 'MOBILIZE'] 71 | 72 | """ 73 | text = [i['event_text'] for i in story_list] 74 | embeddings = self.model.encode(text, show_progress_bar=self.progress_bar) 75 | 76 | preds = [] 77 | for event, clf in self.model_dict.items(): 78 | y_pred = clf.predict_proba(embeddings)[:,1] 79 | preds.append(y_pred) 80 | 81 | pred_array = np.array(preds).T 82 | 83 | # convert the matrix of predictions to a list of lists 84 | preds = [] 85 | for i in pred_array: 86 | preds.append([self.event_types[j] for j in np.where(i > self.threshold)[0]]) 87 | 88 | for n, story in enumerate(story_list): 89 | story['event_type'] = preds[n] 90 | 91 | return story_list 92 | -------------------------------------------------------------------------------- /NGEC/formatter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rich import print 3 | import jsonlines 4 | 5 | import pandas as pd 6 | import os 7 | import dateparser 8 | from datetime import datetime 9 | import re 10 | 11 | import logging 12 | logger = logging.getLogger(__name__) 13 | logger.addHandler(logging.NullHandler()) 14 | 15 | # silence dateparser warning. https://github.com/scrapinghub/dateparser/issues/1013 16 | import warnings 17 | warnings.filterwarnings( 18 | "ignore", 19 | message="The localize method is no longer necessary, as this time zone supports the fold attribute", 20 | ) 21 | 22 | 23 | def country_name_dict(base_path): 24 | file = os.path.join(base_path, "countries.csv") 25 | countries = pd.read_csv(file) 26 | country_name_dict = {i:j for i, j in zip(countries['CCA3'], countries['Name'])} 27 | country_name_dict.update({"": ""}) 28 | country_name_dict.update({"IGO": "Intergovernmental Organization"}) 29 | return country_name_dict 30 | 31 | def resolve_date(event): 32 | """ 33 | Create a new 'date_resolved' key with a date in YYYY-MM-DD format 34 | 35 | TODO: 36 | include granularity details (e.g., month, year)?
37 | >>> DateDataParser().get_date_data('March 2015') 38 | DateData(date_obj=datetime.datetime(2015, 3, 16, 0, 0), period='month', locale='en') 39 | """ 40 | if 'DATE' not in event['attributes'].keys(): 41 | pub_date = dateparser.parse(event['pub_date']).strftime("%Y-%m-%d") 42 | event['date_resolved'] = pub_date 43 | event['date_raw'] = "No date detected--using publication date" 44 | return event 45 | if not event['attributes']['DATE']: 46 | pub_date = dateparser.parse(event['pub_date']).strftime("%Y-%m-%d") 47 | event['date_resolved'] = pub_date 48 | event['date_raw'] = "" 49 | return event 50 | 51 | base_date = dateparser.parse(event['pub_date']) 52 | raw_date = event['attributes']['DATE'][0]['text'] 53 | print(f"raw_date: {raw_date}") 54 | 55 | resolved_date = dateparser.parse(date_string=raw_date, settings={'RELATIVE_BASE': base_date, 56 | 'PREFER_DATES_FROM': "past"}) 57 | if not resolved_date: 58 | if re.search("next|later", raw_date): 59 | raw_date = re.sub(r"next|later", "", raw_date).strip() 60 | resolved_date = dateparser.parse(date_string=raw_date, settings={'RELATIVE_BASE': base_date, 61 | 'PREFER_DATES_FROM': "future"}) 62 | if resolved_date: 63 | event['date_resolved'] = resolved_date.strftime("%Y-%m-%d") 64 | event['date_raw'] = raw_date 65 | return event 66 | if not resolved_date: 67 | event['date_resolved'] = event['pub_date'] 68 | event['date_raw'] = "" 69 | return event 70 | else: 71 | event['date_resolved'] = resolved_date.strftime("%Y-%m-%d") 72 | event['date_raw'] = raw_date 73 | return event 74 | 75 | 76 | class Formatter: 77 | def __init__(self, quiet=False, base_path="assets", geolocation_threshold=0.85): 78 | self.quiet = quiet 79 | self.base_path = base_path 80 | self.iso_to_name = country_name_dict(self.base_path) 81 | self.geo_threshold = geolocation_threshold 82 | 83 | """ 84 | event = { 'attributes': { 'ACTOR': [{ 'qa_end_char': 53, 85 | 'qa_score': 0.31743326783180237, 86 | 'qa_start_char': 39, 87 | 'text': 'Nicolas Maduro', 88 | 'score': 0.23675884306430817, 89 | 'wiki': 'Nicolás Maduro', 90 | 'country': 'VEN', 91 | 'code_1': 'ELI', 92 | 'code_2': ''}], 93 | 'LOC': [{ 'qa_end_char': 156, 94 | 'qa_score': 0.4355418384075165, 95 | 'qa_start_char': 148, 96 | 'text': 'Barbados'}], 97 | 'RECIP': [{ 'qa_end_char': 90, 98 | 'qa_score': 0.1324695497751236, 99 | 'qa_start_char': 79, 100 | 'score': 0.13248120248317719, 101 | 'wiki': 'Juan Guaidó', 102 | 'country': 'VEN', 103 | 'code_1': 'REB', 104 | 'code_2': '', 105 | 'text': 'Juan Guaidó'}]}, 106 | 'contexts': ['pro_democracy'], 107 | 'date': '2019-08-01', 108 | 'event_geolocation': { 'admin1_code': '00', 109 | 'admin1_name': '', 110 | 'admin2_code': '', 111 | 'admin2_name': '', 112 | 'country_code3': 'BRB', 113 | 'end_char': 156, 114 | 'event_location_overlap_score': 1.0, 115 | 'feature_class': 'A', 116 | 'feature_code': 'PCLI', 117 | 'geonameid': '3374084', 118 | 'lat': 13.16453, 119 | 'lon': -59.55165, 120 | 'resolved_placename': 'Barbados', 121 | 'score': 1.0, 122 | 'search_placename': 'Barbados', 123 | 'start_char': 148}, 124 | 'event_mode': [], 125 | 'event_text': 'Delegates of the Venezuelan president, Nicolas Maduro, and ' 126 | 'the leader objector Juan Guaidó resumed on Wednesday (31) ' 127 | 'conversations on the island of Barbados, sponsored by ' 128 | 'Norway, to seek a way out of the crisis in their country, ' 129 | 'announced the parties. 
"We started another round of ' 130 | 'sanctions under the mechanism of Oslo," indicated on ' 131 | 'Twitter Mr Stalin González, one of the envoys of Guaidó, ' 132 | 'parliamentary leader recognized as interim president by ' 133 | 'half hundred countries. The vice-president of Venezuela, ' 134 | 'Delcy Rodríguez, confirmed in a press conference that ' 135 | 'representatives of mature traveled to Barbados for the ' 136 | 'meetings with the opposition. Mature reaffirmed in a ' 137 | 'message to the nation that the government seeks to ' 138 | 'establish a "bureau for permanent dialog with the ' 139 | 'opposition, and called entrepreneurs and social movements ' 140 | 'to be added to the process. After exploratory ' 141 | 'approximations and a first face to face in Oslo in mid-May, ' 142 | 'the parties have transferred the dialog on 8 July for the ' 143 | 'caribbean island. The opposition search in the negotiations ' 144 | 'the output of mature and a new election, by considering ' 145 | 'that his second term, started last January, resulted from ' 146 | 'fraudulent elections, not recognized by almost 60 ' 147 | 'countries, among them the United States. ', 148 | 'event_type': 'RETREAT', 149 | 'geolocated_ents': [ { 'admin1_code': '00', 150 | 'admin1_name': '', 151 | 'admin2_code': '', 152 | 'admin2_name': '', 153 | 'country_code3': 'BRB', 154 | 'end_char': 156, 155 | 'event_location_overlap_score': 1.0, 156 | 'feature_class': 'A', 157 | 'feature_code': 'PCLI', 158 | 'geonameid': '3374084', 159 | 'lat': 13.16453, 160 | 'lon': -59.55165, 161 | 'resolved_placename': 'Barbados', 162 | 'score': 1.0, 163 | 'search_placename': 'Barbados', 164 | 'start_char': 148}, 165 | { 'admin1_code': '00', 166 | 'admin1_name': '', 167 | 'admin2_code': '', 168 | 'admin2_name': '', 169 | 'country_code3': 'NOR', 170 | 'end_char': 177, 171 | 'feature_class': 'A', 172 | 'feature_code': 'PCLI', 173 | 'geonameid': '3144096', 174 | 'lat': 62.0, 175 | 'lon': 10.0, 176 | 'resolved_placename': 'Kingdom of Norway', 177 | 'score': 1.0, 178 | 'search_placename': 'Norway', 179 | 'start_char': 171}, 180 | { 'admin1_code': '12', 181 | 'admin1_name': 'Oslo', 182 | 'admin2_code': '0301', 183 | 'admin2_name': 'Oslo', 184 | 'country_code3': 'NOR', 185 | 'end_char': 318, 186 | 'feature_class': 'P', 187 | 'feature_code': 'PPLC', 188 | 'geonameid': '3143244', 189 | 'lat': 59.91273, 190 | 'lon': 10.74609, 191 | 'resolved_placename': 'Oslo', 192 | 'score': 1.0, 193 | 'search_placename': 'Oslo', 194 | 'start_char': 314}, 195 | { 'admin1_code': '00', 196 | 'admin1_name': '', 197 | 'admin2_code': '', 198 | 'admin2_name': '', 199 | 'country_code3': 'VEN', 200 | 'end_char': 502, 201 | 'feature_class': 'A', 202 | 'feature_code': 'PCLI', 203 | 'geonameid': '3625428', 204 | 'lat': 8.0, 205 | 'lon': -66.0, 206 | 'resolved_placename': 'Bolivarian Republic of ' 207 | 'Venezuela', 208 | 'score': 1.0, 209 | 'search_placename': 'Venezuela', 210 | 'start_char': 493}, 211 | { 'admin1_code': '00', 212 | 'admin1_name': '', 213 | 'admin2_code': '', 214 | 'admin2_name': '', 215 | 'country_code3': 'BRB', 216 | 'end_char': 604, 217 | 'feature_class': 'A', 218 | 'feature_code': 'PCLI', 219 | 'geonameid': '3374084', 220 | 'lat': 13.16453, 221 | 'lon': -59.55165, 222 | 'resolved_placename': 'Barbados', 223 | 'score': 1.0, 224 | 'search_placename': 'Barbados', 225 | 'start_char': 596}, 226 | { 'admin1_code': '12', 227 | 'admin1_name': 'Oslo', 228 | 'admin2_code': '0301', 229 | 'admin2_name': 'Oslo', 230 | 'country_code3': 'NOR', 231 | 'end_char': 918, 232 | 
'feature_class': 'P', 233 | 'feature_code': 'PPLC', 234 | 'geonameid': '3143244', 235 | 'lat': 59.91273, 236 | 'lon': 10.74609, 237 | 'resolved_placename': 'Oslo', 238 | 'score': 1.0, 239 | 'search_placename': 'Oslo', 240 | 'start_char': 914}, 241 | { 'admin1_code': '00', 242 | 'admin1_name': '', 243 | 'admin2_code': '', 244 | 'admin2_name': '', 245 | 'country_code3': 'USA', 246 | 'end_char': 1259, 247 | 'feature_class': 'A', 248 | 'feature_code': 'PCLI', 249 | 'geonameid': '6252001', 250 | 'lat': 39.76, 251 | 'lon': -98.5, 252 | 'resolved_placename': 'United States', 253 | 'score': 1.0, 254 | 'search_placename': 'United States', 255 | 'start_char': 1239}], 256 | 'headline': 'Governo e oposição da Venezuela retomam diálogo em Barbados\n', 257 | 'id': '20190801-2309-4e081644904c_COOPERATE_R', 258 | 'pub_date': '2019-08-01', 259 | 'publisher': 'translateme2-pt', 260 | 'story_id': 'AFPPT00020190801ef81000jh:50066619', 261 | 'story_people': ['Delcy Rodríguez', 'Guaidó', 'Nicolas Maduro', 'Stalin González', 'Juan Guaidó'], 262 | 'story_orgs': ['Mature'], 263 | 'story_locs': ['Norway', 'United States', 'Barbados', 'Oslo', 'Venezuela'], 264 | 'version': 'NGEC_coder-Vers001-b1-Run-001'} 265 | """ 266 | 267 | def find_event_loc(self, event, geo_overlap_thresh=0.5): 268 | if 'LOC' not in event['attributes'].keys(): 269 | event['event_geolocation'] = {"reason": "No LOC attribute found by the QA/attribute model", 270 | "geo": None} 271 | return event 272 | try: 273 | event_loc_raw = event['attributes']['LOC'][0] ## NOTE!! Assuming just one location from the QA model 274 | except IndexError: 275 | event['event_geolocation'] = {"reason": "No LOC attribute found by the QA/attribute model", 276 | "geo": None} 277 | return event 278 | if not event_loc_raw: 279 | event['event_geolocation'] = {"reason": "No LOC attribute found by the QA/attribute model", 280 | "geo": None} 281 | return event 282 | if 'geolocated_ents' not in event.keys(): 283 | event['event_geolocation'] = {"reason": "No story locations were geolocated (Missing 'geolocated_ents' key).", 284 | "geo": None} 285 | return event 286 | event_loc_chars = set(range(event_loc_raw['qa_start_char'], event_loc_raw['qa_end_char'])) 287 | geo_ent_ranges = [set(range(i['start_char'], i['end_char'])) for i in event['geolocated_ents']] 288 | # calculate intersection-over-union/Jaccard 289 | ious = np.array([len(event_loc_chars.intersection(i)) / len(event_loc_chars.union(i)) for i in geo_ent_ranges]) 290 | if len(ious) == 0: 291 | event['event_geolocation'] = {"reason": "No geolocated entities", 292 | "geo": None} 293 | return event 294 | try: 295 | if np.max(ious) < geo_overlap_thresh: 296 | event['event_geolocation'] = {"reason": f"Attribute placename ({event_loc_raw['text']}) doesn't overlap enough with any placenames: {str(np.max(ious))}", 297 | "geo": None} 298 | return event 299 | except ValueError: 300 | event['event_geolocation'] = {"reason": "Problem with intersection-overlap vector. No elements?", 301 | "geo": None} 302 | return event 303 | best_match = event['geolocated_ents'][np.argmax(ious)] 304 | if not best_match: 305 | event['event_geolocation'] = {"reason": "No 'best_match' geolocated entity", 306 | "geo": None} 307 | return event 308 | best_match['event_location_overlap_score'] = float(np.max(ious)) 309 | if 'score' not in best_match.keys(): 310 | event['event_geolocation'] = {"reason": "'best_match' identified but no 'score' key.
Returning best_match anyway", 311 | "geo": best_match} 312 | return event 313 | if best_match['score'] > self.geo_threshold: 314 | event['event_geolocation'] = {"reason": f": Successful overlap between attribute placename and one of the geoparser results", 315 | "geo": best_match} 316 | return event 317 | else: 318 | event['event_geolocation'] = {"reason": f": Successful overlap between attribute placename and one of the geoparser results BUT geoparser score was too low ({best_match['score']})", 319 | "geo": None} 320 | return event 321 | 322 | 323 | 324 | 325 | def add_meta(self, event): 326 | """ 327 | Add optional metadata to the event dictionary (e.g. alternative country codes, country names, 328 | event intensity, event quad class, etc.) 329 | """ 330 | for k, att in event['attributes'].items(): 331 | # add stuff to actors and recipients 332 | if k in ["LOC", "DATE"]: 333 | continue 334 | for v in att: 335 | try: 336 | v['country_name'] = self.iso_to_name[v['country']] 337 | except: 338 | print(v['country']) 339 | v['country_name'] = "" 340 | 341 | return event 342 | 343 | 344 | def process(self, event_list, return_raw=False): 345 | """ 346 | Create and write out a final cleaned dictionary/JSON file of events. 347 | 348 | Parameters 349 | ---------- 350 | event_list: list of dicts 351 | list of events after being passed through each of the processing steps 352 | return_raw: bool 353 | If true, don't write to a final and instead return the final version. Useful for 354 | debugging. Defaults to False. 355 | """ 356 | for n, event in enumerate(event_list): 357 | #if n == 0: 358 | # print(e) 359 | event = self.find_event_loc(event) 360 | event = self.add_meta(event) 361 | try: 362 | event = resolve_date(event) 363 | except Exception as exception: 364 | logger.warning(f"{exception} parsing date for event number {n}") 365 | if return_raw: 366 | return event_list 367 | else: 368 | with jsonlines.open("events_processed.jsonl", "w") as f: 369 | f.write_all(event_list) 370 | 371 | -------------------------------------------------------------------------------- /NGEC/geolocation.py: -------------------------------------------------------------------------------- 1 | from mordecai3 import Geoparser 2 | from rich.progress import track 3 | from rich import print 4 | import time 5 | import jsonlines 6 | import pandas as pd 7 | import os 8 | 9 | import logging 10 | logger = logging.getLogger(__name__) 11 | logger.addHandler(logging.NullHandler()) 12 | 13 | def country_name_dict(base_path): 14 | file = os.path.join(base_path, "countries.csv") 15 | countries = pd.read_csv(file) 16 | country_name_dict = {i:j for i, j in zip(countries['CCA3'], countries['Name'])} 17 | country_name_dict.update({"": ""}) 18 | country_name_dict.update({"IGO": "Intergovernmental Organization"}) 19 | return country_name_dict 20 | 21 | 22 | class GeolocationModel: 23 | def __init__(self, 24 | geo_model="/Users/ahalterman/MIT/Geolocation/mordecai3_scratch/mordecai3/mordecai_new.pt", 25 | nlp=None, 26 | base_path = "NGEC/assets/", 27 | geo_path = "../mordecai3/mordecai3/assets/", 28 | save_intermediate=False, 29 | quiet=False): 30 | self.geo = Geoparser(geo_model, 31 | geo_asset_path=geo_path, 32 | nlp=nlp, 33 | event_geoparse=False, 34 | trim=True, 35 | debug=False) 36 | self.quiet = quiet 37 | self.save_intermediate = save_intermediate 38 | self.iso_to_name = country_name_dict(base_path) 39 | 40 | 41 | def process(self, story_list, doc_list): 42 | """ 43 | Wrap the Mordecai3 geoparser function. 
44 | 45 | Parameters 46 | -------- 47 | story_list: list of story dicts. See example 48 | doc_list: list of spaCy docs 49 | 50 | Example 51 | ------ 52 | event = {'id': '20190801-2227-8b13212ac6f6', 53 | 'date': '2019-08-01', 54 | 'event_type': ['SANCTION', 'PROTEST'], 55 | 'event_mode': [], 56 | 'event_text': 'The Liberal Party, the largest opposition in Paraguay, announced in the evening of Wednesday the decision to submit an application of impeachment against the president of the country, Mario Abdo Benítez, and vice-president Hugo Velázquez, by polemical agreement with Brazil on the purchase of energy produced in Itaipu. According to the president of the Liberal Party, Efraín Alegre, the opposition also come tomorrow with penal action against all those involved in the negotiations of the agreement with Brazil, signed on confidentiality in May and criticized for being detrimental to the interests of the country. The Liberal Party has the support of the front Guasú, Senator and former President Fernando Lugo, he himself target of an impeachment, decided in less than 24 hours, in June 2012. According to legend, the reasons for the opening of the proceedings against Abdo Benítez are bad performance of functions, betrayal of the homeland and trafficking of influence. Alegre also announced the convocation of demonstrations throughout the country on Friday. ', 57 | 'story_id': 'EFESP00020190801ef8100001:50066618', 58 | 'publisher': 'translateme2-pt', 59 | 'headline': '\nOposição confirma que pedirá impeachment de presidente do Paraguai; PARAGUAI GOVERNO (Pauta)\n', 60 | 'pub_date': '2019-08-01', 'contexts': ['corruption'], 61 | 'version': 'NGEC_coder-Vers001-b1-Run-001', 62 | 'attributes': {'ACTOR': {'text': 'Mario Abdo Benítez', 'score': 0.1976235955953598}, 63 | 'RECIP': {'text': 'Fernando Lugo', 'score': 0.10433810204267502}, 64 | 'LOC': {'text': 'Paraguay', 'score': 0.24138706922531128}}} 65 | gp.process([event]) 66 | """ 67 | if len(doc_list) != len(story_list): 68 | raise ValueError(f"story_list length does not match spaCy doc list len: {len(story_list)} vs. 
{len(doc_list)}.") 69 | 70 | for n, story in track(enumerate(story_list), total=len(story_list), description="Geoparsing stories..."): 71 | doc = doc_list[n] 72 | res = self.geo.geoparse_doc(doc) 73 | for r in res['geolocated_ents']: 74 | try: 75 | r['country_name'] = self.iso_to_name[r['country_code3']] 76 | except KeyError: 77 | #logger.warning(f"Missing country code for {r}") 78 | r['country_name'] = None 79 | #if 'placename' not in r.keys(): 80 | # print("'placename' key missing from geolocation results") 81 | # #print(r) 82 | # continue 83 | #r['search_placename'] = r['placename'] 84 | #if 'resolved_placename' not in r.keys() and 'name' in r.keys(): 85 | # r['resolved_placename'] = r['name'] 86 | # del r['name'] 87 | if 'name' in r.keys(): 88 | r['resolved_placename'] = r['name'] 89 | del r['name'] 90 | story['geolocated_ents'] = res['geolocated_ents'] 91 | 92 | 93 | if self.save_intermediate: 94 | fn = time.strftime("%Y_%m_%d-%H") + "_geolocation_output.jsonl" 95 | with jsonlines.open(fn, "w") as f: 96 | f.write_all(story_list) 97 | 98 | return story_list 99 | 100 | 101 | 102 | if __name__ == "__main__": 103 | #import streamlit as st 104 | 105 | #@st.cache(allow_output_mutation=True, suppress_st_warning=True) 106 | #def make_ag(): 107 | # ag = ActorResolver() 108 | # return ag 109 | 110 | #ag = make_ag() 111 | 112 | #query_text = st.text_input("Enter an actor string") 113 | #query_date = st.text_input("Enter a date", "today") 114 | 115 | #best = ag.agent_to_code(query_text, query_date) 116 | #st.write(best) 117 | import jsonlines 118 | 119 | ag = ActorResolver() 120 | with jsonlines.open("PLOVER_coding_201908_with_attr.jsonl", "r") as f: 121 | data = list(f.iter()) 122 | 123 | out = ag.process(data) 124 | with jsonlines.open("PLOVER_coding_201908_with_actor.jsonl", "w") as f: 125 | f.write_all(out) 126 | -------------------------------------------------------------------------------- /NGEC/mode_class.py: -------------------------------------------------------------------------------- 1 | import sklearn 2 | 3 | def _load_model(model_dir): 4 | """ 5 | Load the mode classification models. 6 | 7 | Parameters 8 | ---------- 9 | model_dir: Path 10 | path to the mode classification models 11 | 12 | Returns 13 | ------ 14 | model_dict: dict 15 | With event classes as keys and models as values. 16 | """ 17 | raise NotImplementedError() 18 | 19 | 20 | class ModeClass: 21 | def __init__(self, 22 | model_dir="assets/mode_class_models/", 23 | threshold=0.6 # we can set stuff like this here 24 | ): 25 | 26 | self.model_dict = _load_model(model_dir) 27 | self.threshold = threshold 28 | 29 | 30 | def process(self, story_list): 31 | """ 32 | Process a list of stories to detect the event class. 33 | 34 | Example 35 | ------- 36 | The input is a list of dictionaries, each with an 'event_text' key with the full text of the story 37 | and a 'event_type' key with a list of detected event types, e.g. ['SANCTION', 'MOBILIZE'] 38 | 39 | {'date': '2019-08-01', 40 | 'event_type': ['SANCTION', 'MOBILIZE'], 41 | 'event_text': 'Indonesia is investigating a report that ... ', 42 | 'headline': 'Indonesia says it is probing a report of a ...', 43 | 'id': '', 44 | 'pub_date': '2019-08-01', 45 | 'publisher': '', 46 | 'story_id': '', 47 | 'version': ''} 48 | 49 | Parameters 50 | ---------- 51 | story_list: list of dicts 52 | Each dictionary must have an 'event_text' key with the full text of the story and 53 | an 'event_type' key with a list of detected event types. 
54 | 55 | Returns 56 | ------- 57 | story_list: list of dicts 58 | Each story dictionary now contains an "event_mode" key with a list of detected modes (str). E.g.: 59 | 'event_mode': ['SANCTION-withdraw'] 60 | 61 | 62 | """ 63 | # validate the input before the (not yet implemented) classification step 64 | for story in story_list: 65 | if 'event_text' not in story: 66 | raise ValueError("No 'event_text' key in input.") 67 | if 'event_type' not in story: 68 | raise ValueError("Must have detected event types in input.") 69 | raise NotImplementedError() -------------------------------------------------------------------------------- /NGEC/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/tests/__init__.py -------------------------------------------------------------------------------- /NGEC/tests/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/tests/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /NGEC/tests/__pycache__/conftest.cpython-39-pytest-7.0.1.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/tests/__pycache__/conftest.cpython-39-pytest-7.0.1.pyc -------------------------------------------------------------------------------- /NGEC/tests/__pycache__/test_actor_resolution.cpython-39-pytest-7.0.1.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/NGEC/tests/__pycache__/test_actor_resolution.cpython-39-pytest-7.0.1.pyc -------------------------------------------------------------------------------- /NGEC/tests/conftest.py: -------------------------------------------------------------------------------- 1 | from ..actor_resolution import ActorResolver 2 | from ..formatter import Formatter 3 | import pytest 4 | import spacy 5 | from NGEC import AttributeModel 6 | 7 | @pytest.fixture(scope='session', autouse=True) 8 | def ag(): 9 | return ActorResolver(base_path="./assets/") 10 | 11 | @pytest.fixture(scope='session', autouse=True) 12 | def nlp(): 13 | return spacy.load("en_core_web_trf") 14 | 15 | @pytest.fixture(scope='session', autouse=True) 16 | def am(): 17 | return AttributeModel(model_dir = "./assets/PROP-SQuAD-trained-tinybert-6l-768d-squad2220302-1457", 18 | expand_actors=True, 19 | silent=False) 20 | 21 | -------------------------------------------------------------------------------- /NGEC/tests/test_attribute_model.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import datetime 3 | 4 | def test_nat1(am): 5 | pass -------------------------------------------------------------------------------- /NGEC/tests/test_formatter.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from ..formatter import resolve_date 3 | 4 | def test_resolution(): 5 | event = {"pub_date": "June 20, 2012", 6 | "attributes": {"DATE": [{"text": "last Sunday"}]}} 7 | resolve_date(event) -------------------------------------------------------------------------------- /NGEC/tests/test_multiple_actors.py:
-------------------------------------------------------------------------------- 1 | 2 | import re 3 | 4 | def make_example(text, actor_phrase, nlp): 5 | doc = nlp(text) 6 | match = re.search(actor_phrase, text) 7 | qa = {'text': actor_phrase, 8 | 'qa_score': 0.4408265948295593, 9 | 'qa_start_char': match.span()[0], 10 | 'qa_end_char': match.span()[1]} 11 | return doc, qa 12 | 13 | def test_1(am, nlp): 14 | # simple split, both actors present in answer span 15 | text = "Ukrainian forces carried out airstrikes against Russians and Belorussians" 16 | actor_phrase = "Russians and Belorussians" 17 | doc, qa = make_example(text, actor_phrase, nlp) 18 | actors = am.find_co_actors(qa, doc) 19 | assert set([i['text'] for i in actors]) == set(["Russians", "Belorussians"]) 20 | 21 | def test_2(am, nlp): 22 | # amod split, both actors present in answer span 23 | text = "Ukrainian forces carried out airstrikes against Russian and Belorussian soldiers" 24 | actor_phrase = "Russian and Belorussian soldiers" 25 | doc, qa = make_example(text, actor_phrase, nlp) 26 | actors = am.find_co_actors(qa, doc) 27 | assert set([i['text'] for i in actors]) == set(["Belorussian soldiers", "Russian soldiers"]) 28 | 29 | def test_3(am, nlp): 30 | # simple split, only one actor present in answer span 31 | text = "Ukrainian forces carried out airstrikes against Russians and Belorussians" 32 | actor_phrase = "Russians" 33 | doc, qa = make_example(text, actor_phrase, nlp) 34 | actors = am.find_co_actors(qa, doc) 35 | assert set([i['text'] for i in actors]) == set(["Russians", "Belorussians"]) 36 | 37 | def test_3b(am, nlp): # renamed from a duplicate "test_3", which silently shadowed the test above 38 | # simple split, only the second actor present in answer span 39 | text = "Ukrainian forces carried out airstrikes against Russians and Belorussians" 40 | actor_phrase = "Belorussians" 41 | doc, qa = make_example(text, actor_phrase, nlp) 42 | actors = am.find_co_actors(qa, doc) 43 | assert set([i['text'] for i in actors]) == set(["Belorussians", "Russians"]) 44 | 45 | def test_4(am, nlp): 46 | # amod split, only the second actor present in answer span 47 | text = "Ukrainian forces carried out airstrikes against Russian and Belorussian soldiers" 48 | actor_phrase = "Belorussian soldiers" 49 | doc, qa = make_example(text, actor_phrase, nlp) 50 | actors = am.find_co_actors(qa, doc) 51 | assert set([i['text'] for i in actors]) == set(["Russian soldiers", "Belorussian soldiers"]) 52 | 53 | def test_5(am, nlp): 54 | # amod, no second actor present in answer span 55 | text = "Ukrainian forces carried out airstrikes against Russian soldiers" 56 | actor_phrase = "Russian soldiers" 57 | doc, qa = make_example(text, actor_phrase, nlp) 58 | actors = am.find_co_actors(qa, doc) 59 | assert set([i['text'] for i in actors]) == set(["Russian soldiers"]) 60 | 61 | def test_6(am, nlp): 62 | # long list 63 | text = "Japan, the United States, Australia and India got together in New York in September last year for the first time." 64 | actor_phrase = "Japan" 65 | doc, qa = make_example(text, actor_phrase, nlp) 66 | actors = am.find_co_actors(qa, doc) 67 | assert set([i['text'] for i in actors]) == set(['Japan', 'United States', 'Australia', 'India']) 68 | 69 | def test_7(am, nlp): 70 | # two actors, full titles 71 | text = "Russian President Vladimir Putin and British Prime Minister Boris Johnson will meet in Geneva next week."
72 | actor_phrase = "Vladimir Putin" 73 | doc, qa = make_example(text, actor_phrase, nlp) 74 | actors = am.find_co_actors(qa, doc) 75 | assert set([i['text'] for i in actors]) == set(['Russian President Vladimir Putin', 'British Prime Minister Boris Johnson']) 76 | 77 | def test_8(am, nlp): 78 | text = "U.S. national security adviser Robert O'Brien said Friday he will hold talks with his counterparts from Japan, Australia, and India in Hawaii in October." 79 | actor_phrase = "Japan" 80 | doc, qa = make_example(text, actor_phrase, nlp) 81 | actors = am.find_co_actors(qa, doc) 82 | assert set([i['text'] for i in actors]) == set(['Australia', 'Japan', 'India']) 83 | 84 | def test_9(am, nlp): 85 | # Checks that we aren't picking up appostive clauses that aren't compound lists 86 | text = "Og Fernandes, rapporteur of Operation Faroeste, revoked the house arrest of Sandra Inês Rusciolelli, the first judge to sign a plea bargaining agreement in Brazil." 87 | actor_phrase = "Sandra Inês Rusciolelli" 88 | doc, qa = make_example(text, actor_phrase, nlp) 89 | actors = am.find_co_actors(qa, doc) 90 | assert set([i['text'] for i in actors]) == set(["Sandra Inês Rusciolelli"]) 91 | 92 | 93 | def test_10(am, nlp): 94 | text = "According to a statement published on its website, Putin and Johnson discussed climate issues in light of of the forthcoming UN climate change conference COP26 and leaders ' summit in Glasgow." 95 | actor_phrase = "Putin" 96 | doc, qa = make_example(text, actor_phrase, nlp) 97 | actors = am.find_co_actors(qa, doc) 98 | assert set([i['text'] for i in actors]) == set(['Putin', 'Johnson']) 99 | 100 | def test_11(am, nlp): 101 | # Actors follow an introductory clause 102 | text = "According to a statement published on its website, Putin and Johnson discussed climate issues in light of of the forthcoming UN climate change conference COP26 and leaders ' summit in Glasgow." 103 | actor_phrase = "Putin and Johnson" 104 | doc, qa = make_example(text, actor_phrase, nlp) 105 | actors = am.find_co_actors(qa, doc) 106 | assert set([i['text'] for i in actors]) == set(['Putin', 'Johnson']) 107 | 108 | def test_12(am, nlp): 109 | text = "Qasr al-Nil Misdemeanor Court earlier cleared 28 arrested suspects and another 24 fugitives over accusations of protesting without prior permission." 110 | actor_phrase = "arrested suspects" 111 | doc, qa = make_example(text, actor_phrase, nlp) 112 | actors = am.find_co_actors(qa, doc) 113 | 114 | def test_13(am, nlp): 115 | text = "Last month Russia and Turkish Foreign Minister Mevlut Cavusoglu both accused Iran of trying to destabilise Syria and Iraq and of sectarianism, prompting Tehran to summon Ankara's ambassador." 116 | actor_phrase = "Russia" 117 | doc, qa = make_example(text, actor_phrase, nlp) 118 | actors = am.find_co_actors(qa, doc) 119 | assert set([i['text'] for i in actors]) == set(['Russia', 'Turkish Foreign Minister Mevlut Cavusoglu']) 120 | 121 | def test_14(am, nlp): 122 | text = "\"Dine Ak Diamono\" talk show hosted by Moustapha Diop, with Bassirou Ngom, lawyer and member of the Alliance for the Republic; Barrister Babacar Ba, leader of the civil society organization known as \"Forum du Justiciable;\" Alassane Kitane, teacher of Philosophy at Amary Ndack Seck High School in Thies; and Oumar Faye of the movement Leeral Askanwi [Enlightening People], as guests - live from studio [Diop] Good evening viewers and thank you for your fidelity to the \"Dine Ak Diamono\" talk show." 
123 | 124 | def test_15(am, nlp): 125 | text = "The Liberal Party, the largest opposition in Paraguay, announced in the evening of Wednesday the decision to submit an application of impeachment against the president of the country, Mario Abdo Benítez, and vice-president Hugo Velázquez, by polemical agreement with Brazil on the purchase of energy produced in Itaipu." 126 | actor_phrase = "Velázquez" 127 | doc, qa = make_example(text, actor_phrase, nlp) 128 | actors = am.find_co_actors(qa, doc) 129 | 130 | def test_16(am, nlp): 131 | text = "The Liberal Party, the largest opposition in Paraguay, announced in the evening of Wednesday the decision to submit an application of impeachment against the president of the country, Mario Abdo Benítez, and vice Hugo Velázquez, by polemical agreement with Brazil on the purchase of energy produced in Itaipu." 132 | actor_phrase = "president" 133 | doc, qa = make_example(text, actor_phrase, nlp) 134 | am.expand_actor(qa, doc) 135 | 136 | def test_17(am, nlp): 137 | # Actors follow an introductory clause 138 | text = "The leaders of Germany, France, and the UK met in light of the forthcoming UN climate change conference COP26 and leaders ' summit in Glasgow." 139 | actor_phrase = "UK" 140 | doc, qa = make_example(text, actor_phrase, nlp) 141 | actors = am.find_co_actors(qa, doc) 142 | assert set([i['text'] for i in actors]) == set(['Germany', 'France', 'UK']) 143 | -------------------------------------------------------------------------------- /NGEC/utilities.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from spacy.tokens import Token 3 | from spacy.language import Language 4 | import numpy as np 5 | import re 6 | 7 | import logging 8 | logger = logging.getLogger(__name__) 9 | logger.addHandler(logging.NullHandler()) 10 | 11 | 12 | def spacy_doc_setup(): 13 | try: 14 | Token.set_extension('tensor', default=False) 15 | except ValueError: 16 | pass 17 | try: 18 | @Language.component("token_tensors") 19 | def token_tensors(doc): 20 | tensors = doc._.trf_data.last_hidden_layer_state 21 | for n, d in enumerate(doc): 22 | if tensors[n]: 23 | d._.set('tensor', tensors[n]) 24 | else: 25 | d._.set('tensor', np.zeros(tensors[0].shape[-1])) 26 | return doc 27 | except ValueError: 28 | pass 29 | 30 | ### TESTING ### 31 | ### Comment this out and run to verify that the new 3.7+ version of spaCy works 32 | #import spacy 33 | #nlp = spacy.load("en_core_web_trf") 34 | #spacy_doc_setup() 35 | #nlp.add_pipe("token_tensors") 36 | # 37 | #doc = nlp("We visited Berlin and Alexanderplatz.") 38 | #doc[3]._.tensor 39 | #### 40 | 41 | def stories_to_events(story_list, doc_list=None): 42 | if not doc_list: 43 | logger.warning("Missing doc list...") 44 | if doc_list: 45 | if len(doc_list) != len(story_list): 46 | raise ValueError("the story list and list of spaCy docs must be the same length") 47 | for n, story in enumerate(story_list): 48 | doc = doc_list[n] 49 | story['story_people'] = list(set([i.text for i in doc.ents if i.label_ == "PERSON"])) 50 | story['story_organizations'] = list(set([i.text for i in doc.ents if i.label_ == "ORG"])) 51 | story['story_places'] = list(set([i.text for i in doc.ents if i.label_ in ["GPE", "LOC", "FAC"]])) 52 | story['_doc_position'] = n 53 | # "lengthen" the story-level data to generate a separate element 54 | # for each event type 55 | event_list = [] 56 | for n, ex in enumerate(story_list): 57 | # event modes are formatted ["ACCUSE-disapprove", "ACCUSE-allege", 
"CONSULT-third-party"] 58 | modes = [i.split("-") for i in ex['event_mode']] 59 | events_with_modes = list(set([i[0] if i else None for i in modes])) 60 | for event_type in ex['event_type']: 61 | if event_type not in events_with_modes: 62 | event_mode = "" 63 | d = ex.copy() # note: the copy is important! 64 | d['event_type'] = event_type 65 | d['orig_id'] = d['id'] 66 | d['event_mode'] = event_mode 67 | d['id'] = d['id'] + "_" + event_type + "_" # generate a new ID 68 | event_list.append(d) 69 | else: 70 | for et, *event_mode in modes: 71 | # annoyingly, the event and mode are separated by a hyphen, but 72 | # there are also hyphens within certain mode names. Merge those back 73 | # together 74 | event_mode = '-'.join([*event_mode]) 75 | if et != event_type: 76 | # skip modes that are attached to the wrong event type 77 | continue 78 | d = ex.copy() # note: the copy is important! 79 | d['event_type'] = event_type 80 | d['orig_id'] = d['id'] 81 | d['event_mode'] = event_mode 82 | d['id'] = d['id'] + "_" + event_type + "_" + event_mode # generate a new ID 83 | event_list.append(d) 84 | return event_list -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NGEC -- Next generation political event coder 2 | 3 | This repository contains the code for the Next Generation Event Coder (NGEC), a 4 | Python library for extracting event data from news text. The pipeline works out-of-the-box 5 | to code events using the [PLOVER event ontology](https://osf.io/preprints/socarxiv/rm5dw/), but can 6 | be easily customized to produce events with a custom ontology. 7 | 8 | It accompanies the working paper, ["Creating Custom Event Data Without Dictionaries: A Bag-of-Tricks"](https://arxiv.org/pdf/2304.01331.pdf). 9 | 10 | ## Overview 11 | 12 | We break the problem of event extraction into six steps: 13 | 14 | 1. Event classification: identify the event described in a document (e.g., PROTEST, ASSAULT, AGREE,...) using a transformer classifier trained on new data. 15 | 2. Sub-event (``mode'') classification: identify a more specific event type (e.g., PROTEST-riot, ASSAULT-aerial), also using a transformer-based classifier. 16 | 3. Context classification: identify themes or topics in a document (e.g., "human rights", "environment") using a classifier. 17 | 4. Event attribute identification: identifying the spans of text that report who carried out the event, who it was directed against, where it occurred, etc. We do this with a fine-tuned question-answering model trained on newly annotated text. 18 | 5. Actor, location, and date resolution: we resolve extracted named actors and recipients to their Wikipedia page using an offline Wikipedia index and a custom neural similarity model. 19 | 6. Entity categorization: Finally, we map the actor to their country and their "sector" code as defined by the PLOVER ontology (e.g., "GOV", "MIL", etc.) 20 | 21 | ![](docs/pipeline_figure.png) 22 | 23 | Currently, this processing pipeline only performs the following steps: 24 | 25 | *Note*: This repo has basic pretrained models for event detection, but does *not* 26 | currently include context and mode models. 27 | 28 | ## Running 29 | 30 | The main script is `ngec_process.py`. 
31 | 32 | ``` 33 | python ngec_process.py 34 | 35 | usage: ngec_process.py [-h] [-m -1] [-a NGEC/assets/PROP-SQuAD-trained-tinybert-6l-768d-squad2220302-1457] [-b NGEC/assets/] 36 | [-g ../mordecai3/mordecai_new.pt] 37 | [input_file] 38 | 39 | positional arguments: 40 | input_file JSONL input file. At a minimum, this should have keys for "id", "date", and 41 | "event_text" 42 | 43 | options: 44 | -h, --help show this help message and exit 45 | -m, --max-stories -1 46 | Max stories to code. -1 is all stories 47 | -a, --attribute-dir NGEC/assets/PROP-SQuAD-trained-tinybert-6l-768d-squad2220302-1457 48 | Location of the QA attribute model 49 | -b, --base-path NGEC/assets/ 50 | Location of the other models and files 51 | -g, --geo-model ../mordecai3/mordecai_new.pt 52 | Location of the geolocation model 53 | ``` 54 | 55 | 56 |
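A typical invocation, assuming your stories live in a JSONL file called `events.jsonl` (the file name here is just an illustration), might look like:

```
python ngec_process.py -m 100 events.jsonl
```

This codes the first 100 stories; drop `-m` (or pass `-m -1`) to code everything.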
57 | <details><summary>Click to view example input</summary> 58 | 59 | ``` 60 | { 61 | "id": "20190801-2227-8b13212ac6f6", 62 | "date": "2019-08-01", 63 | "event_type": [ 64 | "ACCUSE", 65 | "REJECT", 66 | "THREATEN", 67 | "SANCTION" 68 | ], 69 | "event_mode": [], 70 | "event_text": "The Liberal Party, the largest opposition in Paraguay, .... ", 71 | "story_id": "EFESP00020190801ef8100001:50066618", 72 | "publisher": "translateme2-pt", 73 | "headline": "\nOposição confirma q...", 74 | "pub_date": "2019-08-01", 75 | "contexts": [ 76 | "corruption" 77 | ], 78 | "version": "NGEC_coder-Vers001-b1-Run-001" 79 | } 80 | ``` 81 | </details>
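If you are assembling an input file yourself, a minimal sketch using the `jsonlines` library (which the pipeline already depends on) might look like the following. The field values are placeholders; at a minimum you need "id", "date", and "event_text":

```python
import jsonlines

# A placeholder story -- swap in your own IDs, dates, and text
stories = [
    {
        "id": "example-0001",
        "date": "2019-08-01",
        "pub_date": "2019-08-01",
        "event_text": "Full text of the news story goes here.",
        "headline": "Example headline",
        "publisher": "",
        "story_id": "example-0001",
        "version": "NGEC_coder-Vers001-b1-Run-001",
    }
]

# Write one JSON object per line, as ngec_process.py expects
with jsonlines.open("events.jsonl", "w") as f:
    f.write_all(stories)
```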
82 | 83 | 84 | ## Quick start 85 | 86 | First, create a new Conda environment and install the required libraries: 87 | 88 | ``` 89 | conda create -y --name ngec python=3.10 90 | conda activate ngec 91 | 92 | pip install spacy textacy sentence-transformers 93 | python -m spacy download en_core_web_trf 94 | pip install elasticsearch elasticsearch_dsl unidecode dateparser 95 | pip install jsonlines tqdm datasets rich plac 96 | pip install mordecai3 97 | ``` 98 | 99 | Next, set up an Elasticsearch server with offline Wikipedia and Geonames indexes. 100 | Download the pre-built index and start an Elasticsearch instance with it 101 | (the code below assumes you have Docker installed): 102 | 103 | ``` 104 | # Download a pre-built index from my website: 105 | wget https://andrewhalterman.com/files/geonames_wiki_index_2023-03-02.tar.gz 106 | # uncompress it to produce a directory called `geonames_index` (note that this includes both geonames *and* Wiki) 107 | tar -xvzf geonames_wiki_index_2023-03-02.tar.gz 108 | # You may need to set write permissions for Docker to run 109 | # chmod -R 777 ./geonames_index/ 110 | # Then start an Elasticsearch instance in Docker with the uncompressed index as a volume. 111 | # Later versions of Elasticsearch have not been tested. 112 | sudo docker run -d -p 127.0.0.1:9200:9200 -e "discovery.type=single-node" -v ./geonames_index/:/usr/share/elasticsearch/data elasticsearch:7.10.1 113 | ``` 114 | 115 | If you want to build these indices from scratch, see the detailed instructions for [creating an offline Wikipedia index](https://github.com/ahalterman/NGEC/tree/main/setup/wiki) and [setting up offline Geonames in Elasticsearch](https://github.com/openeventdata/es-geonames). 116 | 117 | ## Note on the models 118 | 119 | Because of conditions imposed by our funder and the proprietary data used in the project, we cannot share the training data or the trained event, mode, and context models used to produce the POLECAT event dataset. However, we provide example code for training classifiers on your own data in the [setup](https://github.com/ahalterman/NGEC/tree/main/setup/train_classifiers) directory. We also provide demonstration pretrained models for the event categories used in the POLECAT dataset that draw on a corpus of pseudo-labeled synthetic news stories using an approach described in [Halterman (2023)](https://arxiv.org/abs/2303.16028). These classifiers are not as accurate as the ones used in the POLECAT dataset, but work pretty well and could easily be improved with additional training data. 120 | 121 | ## Citing 122 | 123 | The steps that this pipeline implements are described in more detail in the [paper](https://arxiv.org/pdf/2304.01331.pdf). If you use the pipeline or the techniques we introduce, please cite the following: 124 | 125 | ``` 126 | @article{halterman_et_al2023creating, 127 | title={Creating Custom Event Data Without Dictionaries: A Bag-of-Tricks}, 128 | author={Andrew Halterman and Philip A. Schrodt and Andreas Beger and Benjamin E. Bagozzi and Grace I. Scarborough}, 129 | journal={arXiv preprint arXiv:2304.01331}, 130 | year={2023} 131 | } 132 | ``` 133 | 134 | ## Acknowledgements 135 | 136 | This research was sponsored by the Political Instability Task Force (PITF). The PITF is funded by 137 | the Central Intelligence Agency. The views expressed in this paper are the authors’ alone and do not 138 | represent the views of the U.S. Government.
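Before running the pipeline, it is worth confirming that the Elasticsearch container from the Quick start is actually reachable. A minimal sketch using the `elasticsearch` Python client installed above; the host and port are assumptions based on the Docker command given earlier:

```python
from elasticsearch import Elasticsearch

# Assumes the Docker container from the Quick start is running locally
es = Elasticsearch(hosts=["http://localhost:9200"])
print(es.ping())           # True if the server is reachable
print(es.cat.indices())    # should list the pre-built geonames/wiki indices
```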
-------------------------------------------------------------------------------- /examples/Guardian_SDF_sample.csv.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/examples/Guardian_SDF_sample.csv.zip -------------------------------------------------------------------------------- /examples/NGEC_pres.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/examples/NGEC_pres.pdf -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | 3 | This directory contains some examples for working with the component parts of the NGEC pipeline. 4 | 5 | Currently, it includes: 6 | 7 | - `demo_wiki_resolution.py`: working with Wikipedia lookups and actor coding 8 | - `demo_mordecai.py`: a simple demo showing the functionality of the geoparser 9 | 10 | For information on training your own event, mode, and context models, see the `setup` directory. -------------------------------------------------------------------------------- /examples/demo_mordecai.py: -------------------------------------------------------------------------------- 1 | # Make sure you've installed Mordecai, which is in a separate package. 2 | # E.g, pip install mordecai3 3 | from mordecai3 import Geoparser 4 | from pprint import pprint 5 | 6 | # Create the Geoparser object 7 | # Make sure the path to the model is correct 8 | geo = Geoparser("NGEC/assets/mordecai_2023-03-28.pt") 9 | 10 | output = geo.geoparse_doc("The Mexican government sent 300 National Guard troopers to bolster the southern state of Guerrero on Tuesday, where a local police chief and 12 officers were shot dead in a brutal ambush the day before.") 11 | 12 | pprint(output) 13 | #{'doc_text': 'The Mexican government sent 300 National Guard troopers to ' 14 | # 'bolster the southern state of Guerrero on Tuesday, where a local ' 15 | # 'police chief and 12 officers were shot dead in a brutal ambush ' 16 | # 'the day before.', 17 | # 'event_location_raw': '', 18 | # 'geolocated_ents': [{'admin1_code': '12', 19 | # 'admin1_name': 'Guerrero', 20 | # 'admin2_code': '', 21 | # 'admin2_name': '', 22 | # 'city_id': '', 23 | # 'city_name': '', 24 | # 'country_code3': 'MEX', 25 | # 'end_char': 97, 26 | # 'feature_class': 'A', 27 | # 'feature_code': 'ADM1', 28 | # 'geonameid': '3527213', 29 | # 'lat': 17.66667, 30 | # 'lon': -100.0, 31 | # 'name': 'Estado de Guerrero', 32 | # 'score': 1.0, 33 | # 'search_name': 'Guerrero', 34 | # 'start_char': 89}]} 35 | -------------------------------------------------------------------------------- /examples/demo_wiki_resolution.py: -------------------------------------------------------------------------------- 1 | import jsonlines 2 | import spacy 3 | from NGEC import ActorResolver 4 | from tqdm import tqdm 5 | from pprint import pprint 6 | import pandas as pd 7 | 8 | # NOTE: Make sure you have NGEC installed. 9 | # From the main NGEC repo, install the requirements then run `pip install -e .` 10 | # Also make sure you have the offline Wikipedia index installed. 
11 | # See https://github.com/ahalterman/NGEC#quick-start 12 | 13 | # Change the logging levels--Elasticsearch is very verbose 14 | import logging 15 | logging.getLogger("NGEC.actor_resolution").setLevel(logging.WARNING) 16 | 17 | es_logger = logging.getLogger('elasticsearch') 18 | es_logger.setLevel(logging.WARNING) 19 | 20 | # Load the spaCy model we'll use for named entity recognition 21 | nlp = spacy.load("en_core_web_sm") 22 | 23 | # Load the sample data 24 | data = pd.read_csv("Guardian_SDF_sample.csv.zip", compression='zip') 25 | 26 | # Instantiate the model. 27 | # This assumes that you're running the code from the NGEC/examples directory. 28 | actor_resolution_model = ActorResolver(spacy_model=nlp, 29 | base_path="../NGEC/assets/", 30 | save_intermediate=False, 31 | gpu=False) # Set to True if you have a GPU 32 | 33 | # Run spaCy over the docs (iterate over the text column, not the DataFrame itself) 34 | docs = list(nlp.pipe(data['text'].tolist())) 35 | 36 | # Iterate through the docs, making a list of dicts with the PERSON and ORG entities 37 | entities = [] 38 | for doc in docs: 39 | for ent in doc.ents: 40 | if ent.label_ in ['PERSON', 'ORG']: 41 | ent_text = ent.text 42 | d = {"entity": ent_text, 43 | # make sure to include the sentence text--we'll use this for context 44 | "context": ent.sent.text} 45 | entities.append(d) 46 | 47 | # Now iterate through the extracted entities and resolve them to Wikipedia 48 | wikis = [] 49 | for ent in tqdm(entities): 50 | wiki = actor_resolution_model.query_wiki(ent['entity'], context=ent['context']) 51 | if not wiki: 52 | wiki = {"search_term": ent['entity'], 53 | "title": None} 54 | else: 55 | wiki['search_term'] = ent['entity'] 56 | wikis.append(wiki) 57 | 58 | 59 | # Print out 50 example results 60 | for i in wikis[400:450]: 61 | try: 62 | short_desc = i['short_desc'] 63 | except KeyError: 64 | short_desc = "None" 65 | print(f"{i['search_term']:<30} ---> {i['title']} ({short_desc})") 66 | 67 | 68 | 69 | ## Example of categorizing actors using their linked Wikipedia pages 70 | 71 | wiki = actor_resolution_model.query_wiki("Ben Rhodes", context="The former Obama adviser Ben Rhodes said: “We all owe him our gratitude – he literally made us safer.”") 72 | code = actor_resolution_model.wiki_to_code(wiki) 73 | pprint(code) 74 | 75 | wiki = actor_resolution_model.query_wiki("Niloufar Hamedi", context="The two journalists are Niloufar Hamedi, who broke the news of Amini’s death for wearing her headscarf too loose, and Elaheh Mohammadi, who wrote about Amini’s funeral.") 76 | code = actor_resolution_model.wiki_to_code(wiki) 77 | pprint(code) 78 | 79 | ### The code below lets you explore the output of the model in a little more detail. ### 80 | 81 | ## Print out the full logs--this will give you more detail on how many 82 | ## candidate wikipedia matches there are. 83 | logging.getLogger("NGEC.actor_resolution").setLevel(logging.DEBUG) 84 | 85 | # Experiment with upper/lower case, including/excluding context, etc. 
86 | actor_resolution_model.query_wiki("the International Rescue Committee") 87 | actor_resolution_model.query_wiki("Isis") 88 | actor_resolution_model.query_wiki("Isis", context="Fighting continues in Syria with the terrorist group Isis.") 89 | actor_resolution_model.query_wiki("ISIS", context="Fighting continues in Syria with the terrorist group ISIS.") 90 | actor_resolution_model.query_wiki("ISIS") 91 | 92 | 93 | # Example where coding fails without context 94 | sdf = actor_resolution_model.search_wiki("SDF", fuzziness=0) 95 | 96 | 97 | ## Code to explore how the context similarity model works 98 | 99 | from sentence_transformers import SentenceTransformer 100 | from sentence_transformers.util import cos_sim 101 | 102 | def load_trf_model(model_dir='sentence-transformers/paraphrase-MiniLM-L6-v2'): ## Change to offline!! 103 | model = SentenceTransformer(model_dir) 104 | return model 105 | trf = load_trf_model() 106 | 107 | doc = "The SDF (the Kurdish led force raised by Washington to fight Isis) and the United States are sitting on a volcano in north-east Syria, with tens of thousands of foreign fighters and families in cramped detention centres." 108 | encoded = trf.encode(doc) 109 | res = actor_resolution_model.search_wiki("SDF") 110 | 111 | intro_paras = [i['intro_para'][0:200] for i in res] 112 | encoded_intros = trf.encode(intro_paras) 113 | 114 | sims = cos_sim(encoded, encoded_intros)[0] 115 | res[sims.argmax()] -------------------------------------------------------------------------------- /ngec_process.py: -------------------------------------------------------------------------------- 1 | from NGEC import AttributeModel 2 | from NGEC import ActorResolver 3 | from NGEC import GeolocationModel 4 | from NGEC import Formatter 5 | from NGEC import utilities 6 | 7 | import spacy 8 | from tqdm import tqdm 9 | from rich import print 10 | from rich.progress import track 11 | import plac 12 | from pathlib import Path 13 | import re 14 | 15 | import logging 16 | from rich.logging import RichHandler 17 | 18 | logger = logging.getLogger('main') 19 | handler = RichHandler() 20 | #formatter = logging.Formatter( 21 | # '%(asctime)s %(name)-12s %(levelname)-8s %(message)s') 22 | #handler.setFormatter(formatter) 23 | logger.addHandler(handler) 24 | logger.setLevel(logging.INFO) 25 | logger.propagate = False 26 | 27 | loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict] 28 | for i in loggers: 29 | if re.search(r"NGEC\.", i.name): 30 | i.addHandler(handler) 31 | i.setLevel(logging.INFO) 32 | i.propagate = False 33 | if re.search("elasticsearch", i.name): 34 | i.addHandler(handler) 35 | i.setLevel(logging.WARNING) 36 | 37 | #loggers = [logging.getLogger(name) for name in logging.root.manager.loggerDict] 38 | #print(loggers) 39 | 40 | # we need to keep the raw tensors for each token 41 | 42 | def load_nlp(): 43 | nlp = spacy.load("en_core_web_trf") 44 | nlp.add_pipe("token_tensors") 45 | return nlp 46 | 47 | 48 | def read_input(input_file="NGEC/PLOVER_coding_201908_220302-1049.jsonl", max_stories=10): 49 | import jsonlines 50 | """ 51 | Read in Factiva stories and return a list of stories for processing 52 | 53 | TODO: 54 | - clean up new lines/whitespace at the beginning and end of headlines and stories 55 | - do Phil's dateline remover here? 56 | 57 | Parameters 58 | ---------- 59 | input_file: str
60 | Path to a JSONL file of stories (currently a file, but possibly from a DB in the future) 61 | 62 | Returns 63 | ------- 64 | stories: list of dicts 65 | - text 66 | - title 67 | - publication 68 | - date 69 | """ 70 | if max_stories > 0: 71 | logger.info(f"Limiting to the first {max_stories} stories.") 72 | with jsonlines.open(input_file, "r") as f: 73 | data = list(f.iter()) 74 | return data[:max_stories] if max_stories > 0 else data # a negative value means "code all stories" 75 | 76 | @plac.pos('input_file', "JSONL input file with events, modes, and contexts") 77 | @plac.opt('max_stories', "Max stories to code", type=int) 78 | @plac.opt('attribute_dir', "Location of the QA attribute model", type=str) 79 | @plac.opt('base_path', "Location of the other models and files", type=Path) 80 | @plac.opt('save_intermediate', "Write output of each intermediate step?", type=bool) 81 | @plac.opt('geo_model', "Location of the geolocation model", type=Path) 82 | @plac.opt('gpu', "Set to True if GPU is available", abbrev='d', type=bool) 83 | def ngec(input_file="NGEC/PLOVER_coding_201908_220302-1049.jsonl", 84 | max_stories=-1, 85 | attribute_dir="NGEC/assets/roberta-base-squad2_2022-08-02", 86 | base_path="NGEC/assets/", 87 | save_intermediate=False, 88 | expand_actors=True, 89 | geo_model="../mordecai3/mordecai_2023-02-07_good.pt", 90 | gpu=False): 91 | 92 | utilities.spacy_doc_setup() 93 | nlp = load_nlp() 94 | 95 | # Initialize the processing models/objects 96 | #event_model = EventClassModel() 97 | #context_model = ContextModel() 98 | #mode_model = ModeModel() 99 | logger.info("Loading geolocation model...") 100 | geolocation_model = GeolocationModel(geo_model, 101 | geo_path="../mordecai3/mordecai3/assets/", 102 | save_intermediate=save_intermediate) 103 | attribute_model = AttributeModel(attribute_dir, 104 | silent=True, 105 | gpu=gpu, 106 | save_intermediate=save_intermediate, 107 | expand_actors=expand_actors, 108 | base_path=base_path) 109 | actor_resolution_model = ActorResolver(spacy_model=nlp, 110 | base_path=base_path, 111 | save_intermediate=save_intermediate, 112 | gpu=gpu) 113 | formatter = Formatter(base_path=base_path) 114 | 115 | # Read in the stories 116 | story_list = read_input(input_file, max_stories) 117 | 118 | just_text = [i['event_text'] for i in story_list] 119 | doc_list = list(track(nlp.pipe(just_text), total=len(just_text), description="nlping docs...")) 120 | 121 | #story_list = event_model.process(story_list) 122 | #story_list = mode_model.process(story_list) 123 | #story_list = context_model.process(story_list) 124 | logger.info("Geolocating events...") 125 | story_list = geolocation_model.process(story_list, doc_list) 126 | 127 | event_list = utilities.stories_to_events(story_list, doc_list) 128 | logger.debug("Post-event split") 129 | logger.debug(f"{event_list[0]}") 130 | #event_list = mode_model(event_list) 131 | 132 | logger.info("Running attribute model...") 133 | event_list = attribute_model.process(event_list, doc_list) 134 | #print(event_list[0]) 135 | logger.info("Running actor resolution model...") 136 | event_list = actor_resolution_model.process(event_list, doc_list) 137 | #print(event_list[0]) 138 | 139 | logger.info("Formatting results...") 140 | cleaned_events = formatter.process(event_list) 141 | logger.info("Completed processing.") 142 | 143 | if __name__ == "__main__": 144 | plac.call(ngec) -------------------------------------------------------------------------------- /ngec_streamlit.py: -------------------------------------------------------------------------------- 1 | from NGEC import EventClass 2 | from NGEC import AttributeModel 3 | from NGEC import 
ActorResolver 4 | from NGEC import GeolocationModel 5 | from NGEC import Formatter 6 | from NGEC import utilities 7 | 8 | import streamlit as st 9 | 10 | import spacy 11 | import pandas as pd 12 | 13 | # stuff that's just used to allow streamlit caching 14 | import preshed 15 | import cymem 16 | import spacy_transformers 17 | import thinc 18 | 19 | st.markdown("## NGEC test interface") 20 | 21 | st.markdown("Put in some story text to see what NGEC produces.") 22 | st.markdown("The event classifier step uses the open-source models trained on synthetic documents. The accuracy is not as good as the proprietary models used to produce the POLECAT dataset. To manually override the event classification, set the event type (and mode) in the sidebar.") 23 | st.markdown("Intermediate output is also returned but hidden by default.") 24 | 25 | #@st.cache(allow_output_mutation = True) 26 | @st.cache_resource() 27 | def load_nlp(): 28 | utilities.spacy_doc_setup() 29 | nlp = spacy.load("en_core_web_trf") 30 | nlp.add_pipe("token_tensors") 31 | return nlp 32 | 33 | nlp = load_nlp() 34 | 35 | def format_output(cleaned_events): 36 | for event in cleaned_events: 37 | if 'ACTOR' in event['attributes'].keys() and event['attributes']['ACTOR']: 38 | actors = '; '.join([i['text'] for i in event['attributes']['ACTOR']]) 39 | actor_codes = '; '.join([f"{i['country']} {i['code_1']}" for i in event['attributes']['ACTOR']]) 40 | actor_wikis = '; '.join([i['wiki'] for i in event['attributes']['ACTOR']]) 41 | else: 42 | actors = "" 43 | actor_codes = "" 44 | actor_wikis = "" 45 | if 'RECIP' in event['attributes'].keys() and event['attributes']['RECIP']: 46 | recipients = '; '.join([i['text'] for i in event['attributes']['RECIP']]) 47 | recipient_codes = '; '.join([f"{i['country']} {i['code_1']}" for i in event['attributes']['RECIP']]) 48 | recip_wikis = '; '.join([i['wiki'] for i in event['attributes']['RECIP']]) 49 | else: 50 | recipients = "" 51 | recipient_codes = "" 52 | recip_wikis = "" 53 | if event['event_geolocation']['geo']: 54 | resolved_placename = event['event_geolocation']['geo']['resolved_placename'] 55 | adm1 = event['event_geolocation']['geo']['admin1_name'] 56 | country = event['event_geolocation']['geo']['country_name'] 57 | else: 58 | resolved_placename = "" 59 | adm1 = "" 60 | country = "" 61 | #st.success(actors) 62 | d = {"Raw Actors": actors, 63 | "Actor Codes": actor_codes, 64 | "Actor Wikis": actor_wikis, 65 | "Event Type": event['event_type'], 66 | "Event Mode": event['event_mode'], 67 | "Raw Recipients": recipients, 68 | "Recipient Codes": recipient_codes, 69 | "Recipient Wikis": recip_wikis, 70 | "Resolved Placename": resolved_placename, 71 | "Admin1": adm1, 72 | "Country": country, 73 | "Date": event['date_resolved']} 74 | df = pd.DataFrame(d, index=[0]).transpose() 75 | df = df.reset_index() 76 | df.columns = ["Attribute", "Value"] 77 | # disable row numbers 78 | df.index = [""] * len(df) 79 | st.table(df) 80 | 81 | 82 | 83 | save_intermediate=False 84 | attribute_dir="NGEC/assets/deberta_squadnewsqa_2023-05-22" 85 | base_path="./NGEC/assets/" 86 | 87 | expand_actors=True 88 | geo_model="/home/andy/projects/mordecai/mordecai3/assets/mordecai_2023-02-07_good.pt" 89 | geo_path="/home/andy/projects/mordecai/mordecai3/assets/" 90 | 91 | gpu=True 92 | 93 | #@st.cache(allow_output_mutation = True) 94 | @st.cache_resource() 95 | def load_event_class(): 96 | event_model = EventClass() 97 | return event_model 98 | 99 | pub_date = st.sidebar.text_input("Publication 
date", "today") 100 | event_type = st.sidebar.text_input("Event type", "") 101 | event_mode = st.sidebar.text_input("Mode type", "") 102 | show_intermediate = st.sidebar.checkbox("Show intermediate output", False) 103 | event_model = load_event_class() 104 | 105 | #@st.cache(allow_output_mutation = True) 106 | @st.cache_resource() 107 | def load_geo(save_intermediate=save_intermediate): 108 | geolocation_model = GeolocationModel(geo_model, 109 | geo_path=geo_path, 110 | save_intermediate=save_intermediate) 111 | return geolocation_model 112 | 113 | #@st.cache(allow_output_mutation = True) 114 | @st.cache_resource() 115 | def load_attr(attribute_dir=attribute_dir, silent=True, gpu=gpu, save_intermediate=save_intermediate, expand_actors=expand_actors, 116 | base_path=base_path): 117 | attribute_model = AttributeModel(attribute_dir, 118 | silent=silent, 119 | gpu=gpu, 120 | save_intermediate=save_intermediate, 121 | base_path=base_path, 122 | expand_actors=expand_actors) 123 | return attribute_model 124 | 125 | 126 | @st.cache_resource() 127 | def load_resolution(nlp=nlp, base_path=base_path, save_intermediate=save_intermediate, gpu=gpu): 128 | actor_resolution_model = ActorResolver(spacy_model=nlp, base_path=base_path, save_intermediate=save_intermediate, gpu=gpu) 129 | return actor_resolution_model 130 | 131 | @st.cache_resource() 132 | def load_formatter(base_path=base_path): 133 | formatter = Formatter(base_path=base_path) 134 | return formatter 135 | 136 | geolocation_model = load_geo() 137 | attribute_model = load_attr(base_path=base_path) 138 | actor_resolution_model = load_resolution() 139 | formatter = load_formatter() 140 | 141 | text = st.text_area("Input text", "German troops withdrew from their area of operations in Kandahar last week.") 142 | 143 | 144 | 145 | 146 | if text: 147 | doc_list = [nlp(text)] 148 | 149 | story_list = [{"event_text": text, "id": "123", "event_type": [event_type], "event_mode": [event_mode], "pub_date": pub_date}] 150 | 151 | if not event_type: 152 | story_list = event_model.process(story_list) 153 | if show_intermediate: 154 | with st.expander("Show event class step output", expanded=False): 155 | st.write(story_list) 156 | if not story_list[0]['event_type']: 157 | st.error("No event type detected.") 158 | st.stop() 159 | story_list = geolocation_model.process(story_list, doc_list) 160 | 161 | event_list = utilities.stories_to_events(story_list, doc_list) 162 | 163 | if show_intermediate: 164 | with st.expander("Show geolocation step output", expanded=False): 165 | st.write(event_list) 166 | 167 | event_list = attribute_model.process(event_list, doc_list) 168 | if show_intermediate: 169 | with st.expander("Show attribute step output", expanded=False): 170 | st.write(event_list) 171 | 172 | event_list = actor_resolution_model.process(event_list) 173 | if show_intermediate: 174 | with st.expander("Show actor resolution step output", expanded=False): 175 | st.write(event_list) 176 | 177 | st.markdown("### Final output") 178 | cleaned_events = formatter.process(event_list, return_raw=True) 179 | 180 | st.markdown(text) 181 | format_output(cleaned_events) 182 | 183 | with st.expander("Show raw final output", expanded=False): 184 | st.write(cleaned_events) 185 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # conda install -c huggingface tokenizers 2 | aiohttp==3.8.1 3 | aiosignal==1.2.0 4 | async-timeout==4.0.2 5 | 
attrs==21.4.0 6 | blis==0.7.6 7 | cachetools==5.0.0 8 | catalogue==2.0.6 9 | certifi==2021.10.8 10 | charset-normalizer==2.0.12 11 | click==8.0.4 12 | cymem==2.0.6 13 | cytoolz==0.11.2 14 | datasets==1.18.4 15 | dateparser==1.1.0 16 | dill==0.3.4 17 | filelock==3.6.0 18 | frozenlist==1.3.0 19 | fsspec==2022.2.0 20 | huggingface-hub==0.4.0 21 | idna==3.3 22 | jellyfish==0.9.0 23 | Jinja2==3.0.3 24 | joblib==1.1.0 25 | langcodes==3.3.0 26 | MarkupSafe==2.1.0 27 | mordecai3 @ file:///Users/ahalterman/MIT/Geolocation/mordecai3_scratch 28 | multidict==6.0.2 29 | multiprocess==0.70.12.2 30 | murmurhash==1.0.6 31 | networkx==2.7.1 32 | nltk==3.7 33 | numpy==1.22.3 34 | packaging==21.3 35 | pandas==1.4.1 36 | pathy==0.6.1 37 | Pillow==9.0.1 38 | preshed==3.0.6 39 | pyarrow==7.0.0 40 | pydantic==1.8.2 41 | pyparsing==3.0.7 42 | pyphen==0.12.0 43 | python-dateutil==2.8.2 44 | pytz==2021.3 45 | pytz-deprecation-shim==0.1.0.post0 46 | PyYAML==6.0 47 | regex==2022.3.2 48 | requests==2.27.1 49 | responses==0.18.0 50 | sacremoses==0.0.47 51 | scikit-learn==1.0.2 52 | scipy==1.8.0 53 | sentence-transformers==2.2.0 54 | sentencepiece==0.1.96 55 | six==1.16.0 56 | smart-open==5.2.1 57 | spacy==3.2.3 58 | spacy-legacy==3.0.9 59 | spacy-loggers==1.0.1 60 | srsly==2.4.2 61 | textacy==0.12.0 62 | thinc==8.0.13 63 | threadpoolctl==3.1.0 64 | tokenizers==0.11.6 65 | toolz==0.11.2 66 | torch==1.12.0.dev20220303 67 | torchaudio==0.12.0.dev20220303 68 | torchvision==0.13.0.dev20220303 69 | tqdm==4.63.0 70 | transformers==4.17.0 71 | typer==0.4.0 72 | typing_extensions==4.1.1 73 | tzdata==2021.5 74 | tzlocal==4.1 75 | urllib3==1.26.8 76 | wasabi==0.9.0 77 | xxhash==3.0.0 78 | yarl==1.7.2 79 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from setuptools import setup, find_packages 4 | setup( 5 | name = 'ngec', 6 | packages = find_packages(), 7 | ) -------------------------------------------------------------------------------- /setup/README.md: -------------------------------------------------------------------------------- 1 | ## Setup 2 | 3 | This directory includes code for setting up the offline Wikipedia index and for training custom event detection models. Please see each directory for code and documentation. -------------------------------------------------------------------------------- /setup/train_classifiers/README.md: -------------------------------------------------------------------------------- 1 | # Training classifiers for NGEC 2 | 3 | This directory contains example code for training event classifiers to use in the NGEC pipeline. 4 | 5 | Because of conditions imposed by our funder and the proprietary data used in the project, we cannot share the training data or the trained models used to produce the POLECAT event dataset. However, we can provide example code for training classifiers on your own data and demonstration pretrained models for the event categories used in the POLECAT dataset that draw on a corpus of pseudo-labeled synthetic news stories. These classifiers are not as accurate as the ones used in the POLECAT dataset, but work pretty well and could easily be improved with additional training data. 
For these demonstration classifiers, we use the synthetic data approach described in [Halterman (2023)](https://arxiv.org/abs/2303.16028): we prompt a language model with hand-written headlines that encode the desired event types, generate synthetic news stories from those prompts, and use each headline's event type as a pseudo-label to train the classifier. 6 | 7 | Our primary objective with this pipeline is to encourage other researchers to develop custom event datasets for their own purposes. 8 | Most researchers will want to train custom classifiers using their own event ontologies, which requires generating new training data. 9 | 10 | ## Contents 11 | 12 | - `fit_event_classifier.py`: code to implement a simple multi-label, multi-class classifier. The core classification model is a set of per-event linear support vector classifiers fit on top of sentence embeddings produced by a [sentence-transformers model](https://huggingface.co/sentence-transformers/paraphrase-mpnet-base-v2). 13 | - `generate_synthetic_news.py`: code to generate synthetic news stories using an offline Huggingface pretrained language model. 14 | - `synthetic_headlines.csv`: a list of hand-written headlines used to prompt the language model to generate news stories. 15 | - `gpt_synthetic_events_2023-10-19_19.csv.zip`: a zipped CSV of synthetic news stories with event and mode pseudo-labels for training the event classifier. 16 | 17 | -------------------------------------------------------------------------------- /setup/train_classifiers/fit_event_classifier.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers import SentenceTransformer 2 | import pandas as pd 3 | from sklearn.svm import SVC 4 | import numpy as np 5 | import skops.io as sio 6 | import os 7 | 8 | # path to sentence-transformer model directory if you've downloaded it 9 | # Leave blank to download from huggingface on the fly 10 | MODEL_DIR = "" 11 | #change device to 'cuda' if you have a GPU enabled 12 | DEVICE = "cpu" 13 | 14 | synth_df = pd.read_csv("gpt_synthetic_events_2023-10-19_19.csv.zip", 15 | compression="zip") 16 | 17 | 18 | def load_model(model_name="paraphrase-mpnet-base-v2"): 19 | if MODEL_DIR: 20 | # use the local copy 21 | model = SentenceTransformer(os.path.join(MODEL_DIR, model_name)) 22 | else: 23 | # otherwise, download from huggingface 24 | model = SentenceTransformer(f'sentence-transformers/{model_name}') 25 | return model 26 | 27 | model = load_model() 28 | encoded = model.encode(synth_df['text'].values, show_progress_bar=True, 29 | device=DEVICE).tolist() 30 | synth_df['encoded'] = encoded 31 | 32 | 33 | def fit_initial_model(synth_df): 34 | clf = SVC(class_weight="balanced", 35 | kernel="linear", 36 | probability=True, 37 | C=0.1) 38 | y_train = synth_df['label'] 39 | clf.fit(synth_df['encoded'].to_list(), y_train) 40 | pred = pd.DataFrame(clf.predict_proba(synth_df['encoded'].to_list())) 41 | # rename columns with the event names 42 | pred.columns = clf.classes_ 43 | return pred 44 | 45 | pred = fit_initial_model(synth_df) 46 | synth_df = pd.concat([synth_df, pred], axis=1) 47 | event_types = synth_df['label'].unique() 48 | 49 | for event in event_types: 50 | print(event) 51 | # First, sample the positive cases (assuming the prompts are reliable) 52 | train_pos_synth = synth_df[synth_df['label'] == event].copy() 53 | train_pos_synth['label'] = 1 54 | # Now sample negative cases, but don't pick anything that might 55 | # be a positive case 56 | candidate_neg = synth_df[synth_df[event] < 0.05].copy() 57 | # Take 3x as many negative 
cases as positive cases, or 58 | # as many as we can find 59 | sample_size = min(candidate_neg.shape[0], 60 | train_pos_synth.shape[0] * 3) 61 | train_neg_synth = candidate_neg.sample(sample_size).copy() 62 | train_neg_synth['label'] = 0 63 | # Now combine the positive and negative cases 64 | print(train_pos_synth.shape, train_neg_synth.shape) 65 | train = pd.concat([train_pos_synth, train_neg_synth], axis=0) 66 | X_train = np.array(train['encoded'].tolist()) 67 | y_train = train['label'] 68 | clf = SVC(class_weight="balanced", 69 | kernel="linear", 70 | probability=True) 71 | clf.fit(X_train, y_train) 72 | 73 | sio.dump(clf, f"models/{event}.skops") 74 | 75 | 76 | ## For production use, see https://github.com/ahalterman/NGEC/blob/main/NGEC/event_class.py 77 | 78 | 79 | 80 | 81 | ## A bunch of stuff that didn't really work 82 | 83 | # convert single label y to multi-label y 84 | #from sklearn.preprocessing import MultiLabelBinarizer 85 | #mlb = MultiLabelBinarizer() 86 | #y_train_bin = mlb.fit_transform([[i] for i in y_train]) 87 | #y_val_bin = mlb.fit_transform([[i] for i in y_val]) 88 | # 89 | ## train multi-label logistic regression model 90 | #clf = RandomForestClassifier(class_weight="balanced") 91 | #clf.fit(X_train, y_train_bin) 92 | #y_pred = clf.predict_proba(X_val) 93 | #print(classification_report(y_val_bin, y_pred)) 94 | # 95 | #from cleanlab.classification import CleanLearning 96 | # 97 | #cl = CleanLearning(clf) 98 | #cl.fit(X_train, y_train_bin) 99 | # 100 | ### One-by-one classifiers 101 | #from pulearn import WeightedElkanotoPuClassifier 102 | # 103 | #y_train = np.array(train['event'] == "ASSAULT").astype(int) * 2 - 1 104 | # 105 | #clf = LogisticRegression(C=0.4, class_weight="balanced") 106 | #clf.fit(X_train, y_train) 107 | # 108 | #y_val = np.array(val['event'] == "ASSAULT").astype(int) * 2 - 1 109 | #y_pred = clf.predict(X_val) 110 | #print(classification_report(y_val, y_pred)) 111 | # 112 | # 113 | ## Experimented with PU learning, but it didn't work well 114 | #from pulearn import WeightedElkanotoPuClassifier 115 | #pu_estimator = WeightedElkanotoPuClassifier( 116 | # estimator=clf, labeled=10, unlabeled=20, hold_out_ratio=0.2) 117 | #pu_estimator.fit(X_train, y_train) 118 | # 119 | #y_pred = pu_estimator.predict(X_val) 120 | #print(classification_report(y_val, y_pred)) -------------------------------------------------------------------------------- /setup/train_classifiers/generate_synthetic_news.py: -------------------------------------------------------------------------------- 1 | from transformers import pipeline, set_seed 2 | import re 3 | from tqdm import tqdm 4 | import random 5 | import pandas as pd 6 | import datetime 7 | 8 | text_generator = pipeline('text-generation', model='gpt2-xl', device=0) 9 | #set_seed(42) 10 | 11 | prompt = "Thousands of Soldiers Deployed to Czech Border to Address Unfolding Crisis\n\n(BBC Monitoring) --" 12 | text_generator(prompt, max_length=300) 13 | 14 | cities = ["Abuja", "Kabul", "Belgrade", "Zagreb", "Khartoum", "Vienna", "Dhaka", "Brussels", 15 | "Minsk", "Kinshasa", "Beijing", "Bogota", "Sao Paulo", "Havana", "Berlin", "Prague", 16 | "Moscow", "Washington", "Cairo", "Jerusalem", "Delhi", "Tehran", "Rome", "Amman", 17 | "Beirut", "Tokyo", "Nairobi", "New York", "Panama City", "Oslo", "Damascus", 18 | "Bangkok", "Istanbul", "London", "Abu Dhabi"] 19 | 20 | c_df = pd.read_csv("countries.csv") 21 | countries = c_df['Name'].to_list() 22 | 23 | def make_stories(prompt, source, pattern, max_len=100, n=5): 24 | output = 
text_generator(prompt, 25 | max_length=max_len, 26 | num_return_sequences=n, 27 | pad_token_id=50256 28 | ) 29 | selected = [] 30 | for out in output: 31 | out['text'] = re.sub(re.escape(prompt), "", out['generated_text']) 32 | #toks = set([i.lower() for i in out['text'].split(" ")]) 33 | selected.append(out) 34 | final = [] 35 | for i in selected: 36 | disclaimer = "### THIS IS A SYNTHETIC STORY. DO NOT TRUST THE FACTUAL CONTENT OF THIS TEXT. Created by Andy Halterman to train a document-level political event classifier ###" 37 | text = disclaimer + i['text'].strip() 38 | d = {"text": text, 39 | "title": pattern['title'], 40 | "source": source, 41 | "prompt": prompt, 42 | "label": pattern['event'], 43 | "mode": pattern['mode']} 44 | final.append(d) 45 | return final 46 | 47 | def make_prompt_and_gen(pattern, 48 | source, 49 | max_len=100, 50 | unique_prompts=5, 51 | n_per_city=5): 52 | all_stories = [] 53 | for n in range(unique_prompts): 54 | city = random.sample(cities, 1)[0] 55 | country_1, country_2, country_3 = random.sample(countries, 3) 56 | headline = pattern['title'].format(country_1=country_1, 57 | country_2=country_2, 58 | country_3=country_3, 59 | city=city) 60 | prompt = f"{headline}\n\n({source}) --" 61 | stories = make_stories(prompt, source, pattern, n=n_per_city, max_len=max_len) 62 | all_stories.extend(stories) 63 | return all_stories 64 | 65 | 66 | def run(): 67 | patterns = pd.read_csv("synthetic_headlines.csv") 68 | patterns = patterns.sample(frac=1) 69 | 70 | all_output = [] 71 | for n, pattern in tqdm(patterns.iterrows(), total=patterns.shape[0]): 72 | if not pattern['title']: 73 | continue 74 | print(pattern['event']) 75 | for source in ['Reuters', 'AFP', 'BBC Monitoring', 'AP', 'local sources', 'local media']: 76 | out = make_prompt_and_gen(pattern, source, max_len=300, unique_prompts=2, n_per_city=1) 77 | all_output.extend(out) 78 | #except Exception as e: 79 | # print(e) 80 | df = pd.DataFrame(all_output) 81 | #df.to_csv("gpt_synthetic_events_cities.csv") 82 | today = datetime.datetime.today().strftime('%Y-%m-%d_%H') 83 | df.to_csv(f"gpt_synthetic_events_{today}.csv") 84 | 85 | 86 | if __name__ == "__main__": 87 | run() 88 | -------------------------------------------------------------------------------- /setup/train_classifiers/gpt_synthetic_events_2023-10-19_19.csv.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahalterman/NGEC/787a06ba8df063fec0b718a9c24b6088c938037b/setup/train_classifiers/gpt_synthetic_events_2023-10-19_19.csv.zip -------------------------------------------------------------------------------- /setup/train_classifiers/synthetic_headlines.csv: -------------------------------------------------------------------------------- 1 | event,mode,title 2 | ACCUSE,,{country_1} Leader Criticizes {country_2}'s Response to Crisis 3 | ACCUSE,,"Activists Decry {city} Bombings, Highlighting Civilian Casualties" 4 | ACCUSE,DISAPPROVE,{country_1} Leader Condemns Human Rights Violations in Speech 5 | ACCUSE,DISAPPROVE,{country_1} condemns {country_2} for treatment of migrants 6 | ACCUSE,DISAPPROVE,{country_1} Citizens Express Widespread Disapproval for Leader in Recent Polls 7 | ACCUSE,INVESTIGATE,ICC States that it will Begin Investigation of Alleged War Crimes in {country_1} 8 | ACCUSE,INVESTIGATE,Investigation begins into last week's events 9 | ACCUSE,INVESTIGATE,Police in {country_1} begin probe into last week's scandal 10 | ACCUSE,INVESTIGATE,Commission forms to study the causes of 
last year's disaster 11 | ACCUSE,INVESTIGATE,International Body to Establish Inquiry into Possible {country_1} Genocide 12 | ACCUSE,ALLEGE,{country_1} President Charged with Human Rights Violations 13 | ACCUSE,ALLEGE,Six indicted by grand jury 14 | ACCUSE,ALLEGE,{country_1} brings official complaint against {country_2} for currency manipulation 15 | ACCUSE,ALLEGE,{country_1}: Former Politician Faces Charges in Bribery Scheme 16 | ACCUSE,DISAPPROVE,{country_1} Denounces Actions By {country_2} 17 | ACCUSE,DISAPPROVE,"{country_2} Spokesperson: ""{country_2}'s Behavior is Deplorable""" 18 | ACCUSE,DISAPPROVE,NGO in {city} Issues Statement Condemning Government 19 | ACCUSE,DISAPPROVE,Moderate Candidates in {country_1} Denounce Far-Right Party 20 | ACCUSE,DISAPPROVE,{country_1} Condemns Violence Against Civilians in Ongoing War 21 | ACCUSE,DISAPPROVE,Activists in {city} Decry New Government Policy 22 | ACCUSE,ALLEGE,"{country_1} Accuses {country_2} of Trade Violations, Vows WTO Case" 23 | ACCUSE,ALLEGE,Police in {city} Allege Local Companies Violated Law 24 | ACCUSE,ALLEGE,{city} Police Charge Dozens After Protest 25 | ACCUSE,ALLEGE,{country_1} Alleges {country_2} Violated Laws of War 26 | ACCUSE,ALLEGE,"{city} Man Sues Government, Alleging Rights Violations" 27 | ACCUSE,ALLEGE,Local NGO Accuses Government of Coverup 28 | ACCUSE,ALLEGE,Justice Ministry Brings Case Against Former Officials ({country_1}) 29 | ACCUSE,INVESTIGATE,{city} Police Investigating Shooting Death of Local Man 30 | ACCUSE,INVESTIGATE,{country_1} Convenes Grand Jury to Investigate Corruption 31 | ACCUSE,INVESTIGATE,{country_1} Begins Investigation Into Past Human Rights Violations 32 | ACCUSE,INVESTIGATE,{country_1} Launches Commission on {country_2} Abuses 33 | ACCUSE,INVESTIGATE,Activists Investigate Brutality During {country_1} War 34 | ACCUSE,INVESTIGATE,{country_1} Parliament Establishes Special Investigative Committee 35 | ACCUSE,INVESTIGATE,Police in {city} Appeal For Help in Investigation 36 | AGREE,,"{country_1}, {country_2} Agree to Hold Regular High-Level Meetings" 37 | AGREE,,{country_1} Leader Agrees to Provide Military Aid to {country_1} In Response to Conflict 38 | AGREE,,{country_1} Expresses Willingness to Cooperate on New Regional Security Framework 39 | AGREE,,Finance Ministers Gathered in {city} Agree to Impose New Banking Capital Requirements 40 | AGREE,,{country_1} and {country_2} have Promised to Ratify a New Treaty 41 | AGREE,,{country_1} Offers Humanitarian Support to {city} Following Yesterday's Disaster 42 | AGREE,,{country_1} and {country_2} Agree to New Limits on Conventional Weapons 43 | AGREE,,Mayor of {city} Reaches an Agreement with Striking Workers 44 | AGREE,,Leader of {country_1} Expresses New Willingness to Work with the International Community 45 | AGREE,,Scoop: Secret Negotiations in {city} Produce Promises to Support Rebels 46 | AID,,"{country_1}, {country_2} Sign Billion Dollar Aid Package" 47 | AID,,{country_1} Announces Transfer of Additional Humanitarian Aid to {country_2} in Wake of Disaster 48 | AID,,Military trainers and arms shipment arrive in war-torn {country_1} 49 | AID,,Emergency humanitarian aid arrives in famine-affected areas of {country_1} 50 | AID,,U.N. 
provides humanitarian aid in the wake of last week's disaster 51 | AID,,Four activists granted asylum in local embassy 52 | AID,,Major debt relief as IMF agrees to forgive millions owned by {country_1} 53 | ASSAULT,,{city} Ethnic Clashes Claim 11 Lives 54 | ASSAULT,,"26 Dead in Jihadist Attacks this Week, Media Reports" 55 | ASSAULT,AERIAL,{country_1}: Four killed in air strike 56 | ASSAULT,AERIAL,War planes pummel rebel positions in {country_1} 57 | ASSAULT,AERIAL,"Allied aircraft enforce no-fly-zone, shooting down {country_1} fighter plane" 58 | ASSAULT,ABDUCT,"{country_1} Mayor Kidnapped, Killed" 59 | ASSAULT,ABDUCT,13 Students Abducted from School in {city} by Insurgent Group 60 | ASSAULT,BEAT,Police Searching for Suspects after Prominent Elder Beaten 61 | ASSAULT,BEAT,{country_1} Police Officers Beat Protestors at Opposition Rally 62 | ASSAULT,TORTURE,{country_1}: Widespread Torture During Government Occupation 63 | ASSAULT,TORTURE,"{country_1} Responsible for Torture, Killing of Prominent Activist" 64 | ASSAULT,EXECUTE,Video Purpotedly Shows Execution of Second Hostage 65 | ASSAULT,EXECUTE,Two Local Administrators Beheaded by Insurgents in South {country_1} 66 | ASSAULT,SEXUAL,Army Urged to End Tactics of Sexual Violence During Ongoing Offensive in West {country_1} 67 | ASSAULT,SEXUAL,"Survivors of Attack Describe Rapes, Other Atrocities" 68 | ASSAULT,ASSASSINATE,{country_1} Lawmaker Killed in Targeted Attack in {city} 69 | ASSAULT,ASSASSINATE,Reporter Shot and Killed by Masked Gunmen in {city} 70 | ASSAULT,DESTROY,{country_1} Military Razes Two Homes in Response to Deadly Attack 71 | ASSAULT,DESTROY,"Bandits Injure Six, Raze 47 Houses in {city} Villages" 72 | ASSAULT,PRIMATIVE,Militants Behead Local Official in {city} Over Government Policy 73 | ASSAULT,PRIMATIVE,Priest Burned to Death in Southern {country_1} 74 | ASSAULT,PRIMATIVE,Angry mob throws rocks and bottles 75 | ASSAULT,PRIMATIVE,Local opposition leader beaten with baseball bat 76 | ASSAULT,FIREARMS,"Gunmen Storm Agency Headquarters, Kill Police Officers, Several Civilians" 77 | ASSAULT,FIREARMS,Gunmen Kill 13 in Fresh Attack on {city} 78 | ASSAULT,EXPLOSIVES,"{country_1} official: Blast rocks country's capital, killing 11" 79 | ASSAULT,EXPLOSIVES,"Eight Killed, over 45 Injured in {city} Bomb Explosion" 80 | ASSAULT,SUICIDE-ATTACK,Extremist Group Claims Responsibility for Recent Suicide Bombing 81 | ASSAULT,SUICIDE-ATTACK,Suicide Car Bomb in {city} Kills At Least 44 82 | ASSAULT,AERIAL,"{country_1} Air Force Bombs Training Camp, No Survivors" 83 | ASSAULT,AERIAL,{country_1} Army Says 14 Killed in Air Strike Were Terrorists 84 | ASSAULT,DRONE,"Drone Strike Killed 2 Civillians, Family Says" 85 | ASSAULT,DRONE,Drone strikes increase as {country_1} conflict intensifies 86 | ASSAULT,DRONE,{country_1} UAV destroys enemy targets in {country_2} 87 | ASSAULT,DRONE,{country_1} Kills Senior Commander in {country_2} With Drone Strike 88 | ASSAULT,HEAVY-WEAPONS,Shelling in {city} Kills Six Children 89 | ASSAULT,HEAVY-WEAPONS,Border Shelling Kills Three Civilians in Renewed Tensions between {country_1} and {country_2} 90 | ASSAULT,CROWD-CONTROL,4 Injured as Police Turn on Protestors in {city} 91 | ASSAULT,CROWD-CONTROL,Witness: Military Uses Tear Gas on Rally in Central {city} 92 | ASSAULT,CLEANSING,Mass Deportations Reported in {country_1} in Lead Up to Summit 93 | ASSAULT,CLEANSING,{country_1} Conflict Enters New Stage with Ethnic Cleansing 94 | ASSAULT,MASSACRE,{country_1} Bandits Kill 75 in Pre-dawn Massacre: Family Members 95 | 
ASSAULT,MASSACRE,"Militants Attack {city} Village, Killing 65 Civilians" 96 | ASSAULT,UNCONVENTIONAL,Observers Report Use of Chemical Weapons on Civilians in {country_1} 97 | ASSAULT,UNCONVENTIONAL,Gas Attack in {city} Kills Scores Amidst Government Denials 98 | COERCE,,"Restrictions to Individual Rights, Media Freedoms Imposed in {country_1} Following Transition" 99 | COERCE,,Outrage over {country_1} Repression 100 | COERCE,SIEZE,"Police Raid Human Rights Organization Offices, Seize Computers " 101 | COERCE,SIEZE,Observers Worry About Indiscriminate Police Raids on Ethnic Community in {country_1} 102 | COERCE,RESTRICT,{country_1} Village under Military Lockdown Reportedly Running Out of Food 103 | COERCE,RESTRICT,{country_1} City Blockaded by {country_2} Troops 104 | COERCE,BAN,{country_1}: Three Human Rights NGOs Banned for Ties to Foreign Governments 105 | COERCE,BAN,{country_1} Independence Party Formally Outlawed Under New Law 106 | COERCE,CENSOR,{country_1} Tightens the Screws on Media Freedoms 107 | COERCE,CENSOR,Social Media Platforms Temporarily Suspended in North {country_1} Following Recent Unrest 108 | COERCE,CURFEW,"Turmoil in {country_1}: Curfew Imposed After Clashes, Murders" 109 | COERCE,CURFEW,Nationwide Curfew Imposed Following Recent Protests 110 | COERCE,MARTIAL-LAW,{country_1} Imposes Martial Law in Response to Election Violence 111 | COERCE,MARTIAL-LAW,Prime Minister to Extend Martial Law for Another Week 112 | COERCE,ARREST,Arbitrary Arrests in {country_1} Condemned by International Observers 113 | COERCE,ARREST,{country_1} Arrests 20 in Regional Crackdown 114 | COERCE,DEPORT,{country_1} Expels Human Rights Activist in Response to Accusations 115 | COERCE,DEPORT,Failed Asylum Seeker Deported Back to {country_1} 116 | COERCE,WITHHOLD,{country_1} Cuts Off Internet in Some Provinces In Light of Unrest 117 | COERCE,WITHHOLD,Internet Shutdown in {country_1} Begins In Anticipation of Renewed Violence 118 | COERCE,MISINFORMATION,{country_1} Says Use of Misinformation Response to Ongoing External Threat from {country_2} 119 | COERCE,MISINFORMATION,Facebook admits to 'coordinated misinformation' aimed at upcoming election in {country_1} 120 | COERCE,MISINFORMATION,Foreign 'information operation' spreading disinformation uncovered in {country_1} 121 | COERCE,MISINFORMATION,Incumbent Spreading Fake News Ahead of Upcoming {country_1} Election 122 | COERCE,MISINFORMATION,State actors are behind disinformation campaign says {country_1} 123 | COERCE,CYBER,{country_1} News Site Hit by Cyber Attack 124 | COERCE,CYBER,"Cyber warfare is already happening, says {country_1} government spokesperson" 125 | COERCE,CYBER,{country_1} Cyberattack Takes Down Communications Network for Several Hours 126 | COERCE,CYBER,Power plant disabled by cyber attack 127 | COERCE,CYBER,Foreign hackers target government network in {country_1} 128 | CONCEDE,,Mayor of {city} Promises to End Curfew After Upcoming Elections 129 | CONCEDE,,Candidate Promises to Relax Restrictions on Migration if Elected 130 | CONCEDE,,Mayor of {city} Announced that Police Will Not Beat Protestors 131 | CONCEDE,,Demonstrators Claim Victory as {country_1} Agrees to All Demands 132 | CONCEDE,,Rebels in {country_1} Have Announced Their Intention to Lay Down Arms 133 | CONCEDE,,{country_1}: Evening Curfew will be Lifted Starting Tomorrow 134 | CONCEDE,,Workers Will End Strike After {country_1} Promises Improved Working Conditions 135 | CONCEDE,,{country_1} Will Withdraw Complaint at the WTO Against {country_2} 136 | 
CONCEDE,,{country_1} Will Shutter Controversal Project After Massive Objections 137 | CONCEDE,,{country_1} Agrees to Withdraw from Seized Territory 138 | CONCEDE,,{country_1} Gives Into Demands Made By {country_2} 139 | CONCEDE,,Government of {country_1} Agrees to Concessions in Order to Avert Crisis 140 | CONCEDE,,Concessions Made By {country_1} in Order to Avoid Escalation of Conflict 141 | CONCEDE,,Concessions Made in Order to Secure Peace 142 | CONCEDE,,Government in {country_1} Makes Concessions in Face of Pressure 143 | CONCEDE,,{city}: Opposition Group Concedes After Meeting with Prime Minister 144 | CONCEDE,,{country_1} Makes Concessions to {country_2} on Immigration 145 | CONCEDE,,{country_1} Makes Concessions to the EU 146 | CONCEDE,,Rebel Group Makes Concessions to the {country_1} Government on Peace Talks 147 | CONCEDE,,{country_1} Makes Concessions to {country_2} on {country_3} Militants 148 | CONCEDE,,{country_1} Makes Concessions to {country_2} on Arms Embargo 149 | CONCEDE,,Political group agrees to halt protests in return for promises from {country_1} 150 | CONCEDE,,{country_1} agrees to remove legal restrictions on local ethnic group 151 | CONCEDE,,Activists agrees to suspend protests in return for rebels in {country_1} agreeing to a ceasefire 152 | CONCEDE,,Rebels in {country_1} agree to stop fighting in return for {country_1}'s future concessions 153 | CONCEDE,,{country_1} declares cease-fire with {country_2} 154 | CONCEDE,,"Administrative restrictions lifted on organization following talks with {country_1} officials""" 155 | CONCEDE,,Curfew lifted in {city} following agreement between mayor and protesters 156 | CONCEDE,,{country_1} Promises Concessions to Opposition 157 | CONCEDE,,{country_1} Makes Concessions to {country_2} in Nuclear Talks 158 | CONCEDE,,{country_1} eases administrative restrictions on civil society organizations 159 | CONCEDE,,{city} authorities agree to remove curfew in restive city 160 | CONCEDE,,{country_1} opposition suspends protests for two weeks 161 | CONCEDE,,Parties in {country_1} war declare ceasefire and agree to withdrawal 162 | CONCEDE,,{country_1} makes verbal commitment to ease economic restrictions 163 | CONCEDE,,Reformers gain important concessions in talks with {country_1} 164 | CONCEDE,,President of {country_1} Suspends Plans to Impose Curfews in Capital 165 | CONCEDE,,Israeli Prime Minister Benjamin Netanyahu agrees to halt settlement construction in the West Bank 166 | CONCEDE,,"Rebels in {country_1} announce suspension of attacks against government forces""" 167 | CONCEDE,,Rebels in {country_1} declare a nationwide ceasefire 168 | CONCEDE,,{city}: Opposition Group Concedes After Meeting with Prime Minister 169 | CONCEDE,,{country_1} Makes Concessions to {country_2} on Immigration 170 | CONCEDE,,{country_1} Makes Concessions to the EU 171 | CONCEDE,,Rebel Group Makes Concessions to the {country_1} Government on Peace Talks 172 | CONCEDE,,{country_1} Makes Concessions to {country_2} on {country_3} Militants 173 | CONCEDE,,{country_1} Makes Concessions to {country_2} on Arms Embargo 174 | CONCEDE,,Political group agrees to halt protests in return for promises from {country_1} 175 | CONCEDE,,{country_1} agrees to remove legal restrictions on local ethnic group 176 | CONCEDE,,Activists agrees to suspend protests in return for rebels in {country_1} agreeing to a ceasefire 177 | CONCEDE,,Rebels in {country_1} agree to stop fighting in return for {country_1}'s future concessions 178 | CONCEDE,,{country_1} declares cease-fire with 
{country_2} 179 | CONCEDE,,"Administrative restrictions lifted on organization following talks with {country_1} officials""" 180 | CONCEDE,,Curfew lifted in {city} following agreement between mayor and protesters 181 | CONCEDE,,{country_1} Promises Concessions to Opposition 182 | CONCEDE,,{country_1} Makes Concessions to {country_2} in Nuclear Talks 183 | CONCEDE,,{country_1} eases administrative restrictions on civil society organizations 184 | CONCEDE,,{city} authorities agree to remove curfew in restive city 185 | CONCEDE,,{country_1} opposition suspends protests for two weeks 186 | CONCEDE,,Parties in {country_1} war declare ceasefire and agree to withdrawal 187 | CONCEDE,,{country_1} makes verbal commitment to ease economic restrictions 188 | CONCEDE,,Reformers gain important concessions in talks with {country_1} 189 | CONCEDE,,President of {country_1} Suspends Plans to Impose Curfews in Capital 190 | CONCEDE,,Israeli Prime Minister Benjamin Netanyahu agrees to halt settlement construction in the West Bank 191 | CONCEDE,,"Rebels in {country_1} announce suspension of attacks against government forces""" 192 | CONCEDE,,Rebels in {country_1} declare a nationwide ceasefire 193 | CONSULT,,{country_1}'s Foreign Minister Embarks on Tour of Neighboring Countries 194 | CONSULT,,Negotiations Ongoing at International Climate Conference 195 | CONSULT,VISIT,{country_1} Leader Visits {country_2} to Deepen Ties 196 | CONSULT,VISIT,{country_1} President Visits {city} As Part of Multi-country Tour 197 | CONSULT,THIRD-PARTY,{country_1} Hosts Peace Talks Between {country_2} Government and Rebels in {city} 198 | CONSULT,THIRD-PARTY,Meetings Between {country_1} and {country_2} Representatives Begin in {city} as {country_1} Attempts to Broker Ceasefire 199 | CONSULT,MULTILATERAL,"Key Outcome Agreed to at Recent Global Summit over Climate Change, Environment" 200 | CONSULT,MULTILATERAL,Multilateral Talks Begin in {city} Over Next Phase of Trade Integration 201 | CONSULT,PHONE,{country_1} President Defends his Call to {country_2} Leader Over Dispute 202 | CONSULT,PHONE,"{country_1}, {country_2}, Military Leaders Speak by Phone Amid Heightened Tensions over {country_3}" 203 | COOPERATE,,{country_1} and {country_2} Sign Trade Deal in Renewed Push for Economic Cooperation 204 | COOPERATE,,"{country_1}, {country_2}, Hold Joint Military Drills in Southeast {country_1}" 205 | COOPERATE,,{country_1} and {country_2} Announce Common Currency 206 | COOPERATE,,{country_1} and {country_2} Expand Intelligence Sharing 207 | COOPERATE,,{country_1} and {country_2} Expand Judicial Cooperation 208 | COOPERATE,,{country_1} and {country_2} Begin Joint Military Exercises 209 | COOPERATE,,NGOs Gathered in {city} Establish New Working Group 210 | COOPERATE,,Rebels in {country_1}'s Civil War Begin Operating Under New Joint High Command 211 | COOPERATE,,Record-Breaking Trade Between {country_1} and {country_2} 212 | COOPERATE,,Troops From {country_1} and {country_2} Held Joint Military Maneuvers 213 | COOPERATE,,{country_1} Begins Importing Goods from {country_2} 214 | COOPERATE,,{city} Businesses Increase Investment in {country_1} 215 | COOPERATE,,Team From {city} Arrives in {country_1} To Begin Cooperation 216 | COOPERATE,,{country_1} and {country_2} Sign New Extradition Treaty 217 | COOPERATE,,{country_1} Started Exporting Goods to {country_2} On Wednesday 218 | COOPERATE,,Bilateral Trade Between {country_1} and {country_2} Growing Rapidly 219 | COOPERATE,,Leaders from {country_1} and {country_2} Broke Ground on New Joint 
Infrastructure Project 220 | COOPERATE,,{country_1} and {country_2} Ratify New Cooperation Agreement 221 | COOPERATE,,Manufacturers in {country_1} Create Joint Venture With Factories in {country_2} 222 | COOPERATE,,{country_1} Warship Conducts Joint Training Exercises with {country_2} Navy 223 | MOBILIZE,,Thousands of Soldiers Deployed to {country_1} Border to Address Unfolding Crisis 224 | MOBILIZE,,{country_1} Increases Military Readiness in Response to {country_2} 225 | MOBILIZE,TROOPS,{country_1} Army Calls Upon Conscripts in Preparation for Renewed Offensive 226 | MOBILIZE,TROOPS,Thousands of Troops Deployed to southern {country_1} Amid Simmering Tensions 227 | MOBILIZE,WEAPONS,Military Ramps up Production in Anticipation of Conflict with {country_1} 228 | MOBILIZE,WEAPONS,{country_1} Deploys Missile Defense System to Bolster Position 229 | MOBILIZE,POLICE,Hundreds of Police Called to Capital Following Recent Violence 230 | MOBILIZE,POLICE,Governor orders police to {city} center 231 | MOBILIZE,POLICE,Police Deployed Ahead of {city} Protests 232 | MOBILIZE,MILITIA,Militia Expands Recruitment in Northwest As Talks Fail 233 | MOBILIZE,MILITIA,"Pro-government militia begins operating near the {country_1} capital 234 | " 235 | MOBILIZE,MILITIA,Pro-government aaramilitary forces begin patrols in {country_1} 236 | MOBILIZE,MILITIA,Far-right paramilitary group formed in {country_1} 237 | MOBILIZE,MILITIA,{country_1} Militia Members Called into Action over Recent Tensions 238 | PROTEST,,Hundreds Rally in Capital over Recent Election Violence 239 | PROTEST,,Demonstrators Unite across {country_1} as Protests Spread 240 | PROTEST,DEMO,{country_1}: Peaceful Demonstration Target of Violent Crackdown 241 | PROTEST,DEMO,Dozens march in downtown demonstration 242 | PROTEST,DEMO,Peaceful protests begin across {city} 243 | PROTEST,DEMO,"Over 5,000 Unite for Peaceful Demonstration in Central {city}" 244 | PROTEST,RIOT,{city} Police Station Burned Down after Night of Rioting 245 | PROTEST,RIOT,Riots break out across {country_1} following ethnic violence 246 | PROTEST,RIOT,Shops smashed by violent protestors in {country_1} 247 | PROTEST,RIOT,Recent Ethnic Riots in {country_1} Cast Doubt on Transition to Democracy 248 | PROTEST,RIOT,Police respond with force to rioters in the capital 249 | PROTEST,STRIKE,Transportation Workers Join Nationwide Strike in {country_1} 250 | PROTEST,STRIKE,Over Half of {country_1} Teachers Now Out on Wildcat Strike 251 | PROTEST,STRIKE,Union negotiations collapse as labor strike begins 252 | PROTEST,STRIKE,Workers in {country_1} announce total work stoppage 253 | PROTEST,HUNGER,Jailed {country_1} Activist on Hunger Strike in {country_2} 254 | PROTEST,HUNGER,{country_1}: Imprisoned Regime Critic Announces Hunger Strike 255 | PROTEST,HUNGER,Activists continue hunger strike 256 | PROTEST,BOYCOTT,11 Countries Shun Upcoming Summit Due to Concerns over Human Rights 257 | PROTEST,BOYCOTT,"{country_1}, {country_2}, Announce Boycotts of Upcoming {city} Conference" 258 | PROTEST,BOYCOTT,Local citizens begin boycotting businesses 259 | PROTEST,OBSTRUCT,Protest Continues to Disrupt Traffic on Highway near {city} 260 | PROTEST,OBSTRUCT,Protestors Block {country_1} Border Near {city} 261 | PROTEST,OBSTRUCT,Protestors block highway as part of demonstration 262 | REJECT,,{country_1} Government and Rebels Reject Offer of International Mediation 263 | REJECT,,{country_1} Government Rejects Request to Delay Upcoming Elections 264 | REJECT,ASSIST,Widespread Suffering as {country_1} Government 
Refuses Medical Aid 265 | REJECT,ASSIST,{country_1} Government Declines Humanitarian Assistance to {country_2} As Conditions Worsen 266 | REJECT,CHANGE,{country_1} Voters Reject Referendum on Peace Process 267 | REJECT,CHANGE,{country_1} Rejects Peace Agreement Between {country_2} and Rebels 268 | REJECT,YIELD,{city} Government Declines to End Weekend Curfew Amid Criticism from Rights Groups 269 | REJECT,YIELD,Rebels in {city} Reject Calls for Ceasefire After Mediation Efforts 270 | REJECT,MEET,{country_1} Leader Said to Reject Meeting with {country_1} Head of State During {city} Summit 271 | REJECT,MEET,{country_1}: Chief of Armed Forces Declines Request for High-level Meeting with {country_2} Counterpart 272 | REQUEST,,{country_1}: Presidential Candidate Demands Formal Review of {country_2} Tariffs 273 | REQUEST,,"Protestors Demand Faster Action on Unemployment, Inflation" 274 | REQUEST,ASSIST,"{country_1} asks {country_2} for Military, Economic Aid Amidst Ongoing Crisis" 275 | REQUEST,ASSIST,{country_1} Leader Expresses Need for International Assistance During Lockdown 276 | REQUEST,CHANGE,Activists in {country_1} Request End to Controversial Migration Policy 277 | REQUEST,CHANGE,Foreign Business Demand End to Discriminatory Policies in {city} 278 | REQUEST,YIELD,Human Rights Group Asks for Release of Final Political Prisoner Held in {country_1} 279 | REQUEST,YIELD,{country_1} Asks {country_2} to End Sanctions on Energy Sector 280 | REQUEST,MEET,{country_1} Leader Reportedly Requests Meeting with {country_2}'s President 281 | REQUEST,MEET,Peacekeepers Urge Rebel Leader to Meet with {country_1} Military in Renewed Mediation Push 282 | RETREAT,,{country_1} and {country_2} Establish Repatriation Deal 283 | RETREAT,,"After 13 Years, All Sides Agree to End {country_1} Conflict" 284 | RETREAT,WITHDRAW,"{country_1} Military Withdrawal from {country_2} Leads to Chaos, Uncertainty" 285 | RETREAT,WITHDRAW,Military forces withdraw from {country_1} 286 | RETREAT,WITHDRAW,{country_1} will begin the process of withdrawing some of their military forces from {country_2} 287 | RETREAT,WITHDRAW,{country_1} Army pulls forces out of contested region 288 | RETREAT,WITHDRAW,Insurgents Announce Withdrawal from Northern {country_1} Villages 289 | RETREAT,RELEASE,{country_1} Releases Three Prominent Labor Rights Activists 290 | RETREAT,RELEASE,"After Three Years of Negotiation, {country_1}'s Rebel Group Releases Hostages" 291 | RETREAT,RELEASE,Dozens of captives released in {city} 292 | RETREAT,RETURN,Court Rules that Police Must Return Confiscated Vehicle Seized as Evidence During Raid 293 | RETREAT,RETURN,{country_1} Rebels Return Seized Property as Part of Peace Accord 294 | RETREAT,DISARM,{country_1}'s Remaining Rebel Factions Agree to Disarm in Ongoing Push for Peace 295 | RETREAT,DISARM,Militia lays down arms in disarmament agreement 296 | RETREAT,DISARM,"Following Transition, {country_1} Rebels Formally Disband" 297 | RETREAT,CEASEFIRE,"{country_1}, {country_2}, Sign Ceasefire over Border Dispute" 298 | RETREAT,CEASEFIRE,Ceasefire begins in war-torn region of {country_1} 299 | RETREAT,CEASEFIRE,Hostilities temporarily stop as talks begin in {city} 300 | RETREAT,CEASEFIRE,All sides in {country_1} Conflict Agree to Implement Ceasefire 301 | RETREAT,ACCESS,{country_1} Peacekeepers Arrive in {country_2} 302 | RETREAT,ACCESS,Election Observers Allowed Back into {country_1} After Reforms Implemented 303 | RETREAT,RESIGN,{country_1} Leader Quits Following Corruption Accusations 304 | RETREAT,RESIGN,Mayor of 
{city} Resigns Amidst Investigation into Wrongdoing 305 | SANCTION,,{country_1} Suspends Relations with the West in Ongoing Tensions 306 | SANCTION,,{country_1} Downgrades Diplomatic Ties with {country_2} Amid Ongoing Spat 307 | SANCTION,CONVICT,{country_1}: Warlord Guilty of Crimes Against Humanity 308 | SANCTION,CONVICT,Court hands down convictions in corruption case 309 | SANCTION,CONVICT,Four found guilty after lengthy trial 310 | SANCTION,CONVICT,Former leader of {country_1} sentenced to prison for serious crimes 311 | SANCTION,CONVICT,{country_1} General Convicted of War Crimes and Genocide at UN Tribunal 312 | SANCTION,EXPEL,{country_1} Expels Dozens of Diplomats Following Accusations 313 | SANCTION,EXPEL,{country_1} Leader Ejects International Observers in {city} 314 | SANCTION,WITHDRAW,Rebels Quit {country_1} Peace Talks 315 | SANCTION,WITHDRAW,Human Rights Group Withdraws from {country_1} Amid Increased Security Concerns 316 | SANCTION,DISCONTINUE,{country_1} Faces Shortages as Sanctions Take Hold 317 | SANCTION,DISCONTINUE,{country_1} Imposes Sanctions on Company Linked to {country_2} 318 | SUPPORT,,{country_1} Renews its Support of the World Health Organization 319 | SUPPORT,,{country_1} Citizens Express Support for Ongoing Protests in {city} 320 | SUPPORT,,{country_1} and {country_2} Establish Diplomatic Ties 321 | SUPPORT,,{country_1} and {country_2} Sign Trade Agreement 322 | SUPPORT,,{country_1} Expresses Support for {country_2}'s Policy 323 | SUPPORT,,{country_1} Commends {country_2} for Their Action 324 | SUPPORT,,{country_1} Ratifies Treaty Signed by {country_2} 325 | SUPPORT,,{country_1} and {country_2} Resume Diplomatic Relations 326 | SUPPORT,,{country_1} and {country_2} Improve Diplomatic Cooperation 327 | SUPPORT,,{country_1} and {country_2} Expand Diplomatic Ties 328 | SUPPORT,,{country_1} Approves of {country_2}'s Policy 329 | SUPPORT,,{country_1} and {country_2} Cooperate on Climate Change 330 | SUPPORT,,{country_1} and {country_2} Express Mutual Respect for Each Other 331 | SUPPORT,,{country_1} commends {country_2} for taking steps to improve human rights 332 | SUPPORT,,{country_1} and {country_2} ratify agreement on cultural exchange 333 | SUPPORT,,{country_1} thanks {country_2} for support on political issues 334 | SUPPORT,,{country_1} commends {country_2}'s handling of recent crisis 335 | SUPPORT,,{country_1} and {country_2} commit to working together to resolve regional conflict 336 | SUPPORT,,{country_1} and {country_2} sign landmark peace agreement 337 | SUPPORT,,{country_1} and {country_2} pledge to increase cooperation 338 | SUPPORT,,{country_1} and {country_2} Establish Diplomatic Ties 339 | SUPPORT,,{country_1} and {country_2} Sign Trade Agreement 340 | SUPPORT,,{country_1} Expresses Support for {country_2}'s Policy 341 | SUPPORT,,{country_1} Commends {country_2} for Their Action 342 | SUPPORT,,{country_1} Ratifies Treaty Signed by {country_2} 343 | SUPPORT,,{country_1} and {country_2} Resume Diplomatic Relations 344 | SUPPORT,,{country_1} and {country_2} Improve Diplomatic Cooperation 345 | SUPPORT,,{country_1} and {country_2} Expand Diplomatic Ties 346 | SUPPORT,,{country_1} Approves of {country_2}'s Policy 347 | SUPPORT,,{country_1} and {country_2} Cooperate on Climate Change 348 | SUPPORT,,{country_1} and {country_2} Express Mutual Respect for Each Other 349 | SUPPORT,,{country_1} commends {country_2} for taking steps to improve human rights 350 | SUPPORT,,{country_1} and {country_2} ratify agreement on cultural exchange 351 | SUPPORT,,{country_1} 
thanks {country_2} for support on political issues
352 | SUPPORT,,{country_1} commends {country_2}'s handling of recent crisis
353 | SUPPORT,,{country_1} and {country_2} commit to working together to resolve regional conflict
354 | SUPPORT,,{country_1} and {country_2} sign landmark peace agreement
355 | SUPPORT,,{country_1} and {country_2} pledge to increase cooperation
356 | THREATEN,,Activists Raise Potential of Protests if Election Fraud Ignored
357 | THREATEN,,Union Workers Threaten Strike if Deal is Not Reached by Midnight
358 | THREATEN,RESTRICT,Government Floats Possibility of Major Restrictions on Movement over Protests
359 | THREATEN,RESTRICT,{country_1} Militants Threaten to Restrict Shipping Lanes
360 | THREATEN,BAN,{country_1} Suggests it May Ban Former General from Running in Election
361 | THREATEN,BAN,Officials in {country_1} threaten to ban opposition political parties
362 | THREATEN,BAN,{country_1} threatens to ban civil society organizations
363 | THREATEN,BAN,{country_1} Leader Raises Possibility of Bans on Opposition Parties
364 | THREATEN,ARREST,{country_1} Government to Detain Journalists if they Publicize Fraud Allegations
365 | THREATEN,ARREST,Police threaten to make arrests as unrest continues in {country_1}
366 | THREATEN,ARREST,Officials promise arrests if corruption continues
367 | THREATEN,ARREST,{country_1} Police Threaten Protestors with Arrest
368 | THREATEN,RELATIONS,Leader Says {country_1} Could Suspend Relations with Neighbor over Deal
369 | THREATEN,RELATIONS,Negotiators from {country_1} threaten to walk away from ongoing talks
370 | THREATEN,RELATIONS,{country_1} Raises Potential of Suspending Talks with {country_2} over Tensions
371 | THREATEN,EXPEL,{country_1} Threatens to Expel International Observers
372 | THREATEN,EXPEL,{country_1} Warns {country_2} of Retaliation after Diplomats Expelled
373 | THREATEN,TERRITORY,{country_1} Threatens to Annex Territory if Conditions Are Not Met
374 | THREATEN,TERRITORY,Why is {country_1} Threatening to Invade {country_2}?
375 | THREATEN,TERRITORY,{country_1} warns {country_2} that it plans to invade
376 | THREATEN,VIOLENCE,Insurgents Promise to Renew Bombing Campaign in South {country_1} if Talks Fail
377 | THREATEN,VIOLENCE,{country_1} Leader Threatens to End Protests with Military Force
-------------------------------------------------------------------------------- /setup/wiki/README.md: --------------------------------------------------------------------------------
1 | ## Wikipedia in Elasticsearch
2 | 
3 | The files in this repo will help you set up an offline Wikipedia in Elasticsearch for easy querying.
4 | 
5 | ## Quickstart
6 | 
7 | To get started fast, you can download a pre-built Elasticsearch volume to use in Docker. See the main NGEC/README.md page for quick-start instructions.
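If you go the pre-built route, the volume just needs to be mounted into a standard Elasticsearch container. A minimal sketch, assuming the volume has been extracted to `./wiki_index/`; this mirrors the `docker run` command in `create_index.sh` below:

```bash
docker run -d -p 127.0.0.1:9200:9200 -e "discovery.type=single-node" \
    -v $PWD/wiki_index/:/usr/share/elasticsearch/data elasticsearch:7.10.1
```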
8 | 
9 | ## Building a Wikipedia index
10 | 
11 | You might have a few reasons for building your own offline Wikipedia index:
12 | 
13 | - Using a non-English Wikipedia
14 | - Creating an updated Wikipedia if the pre-built one is stale
15 | - Customizing the format or structure of the index (if so, you'll need to modify the code)
16 | 
17 | ## Index format
18 | 
19 | After ingesting, each Wikipedia article will be stored in the following form:
20 | 
21 | - `title`: the title of the Wikipedia page (no underscores)
22 | - `redirects`: every page that redirects to this page
23 | - `alternative_names`: alternative names for the article, identified from bold phrases in the first sentence
24 | - `short_desc`: Wikipedia's "short description" of the article
25 | - `categories`: the Wikipedia categories associated with this page
26 | - `intro_para`: the cleaned text of the first paragraph of the article. All text after the intro paragraph is discarded for space reasons.
27 | - `infobox`: if the article includes a side infobox, it will be stored here.
28 | - `box_type`: articles can have different box formats, e.g. "legislature", "military unit", "settlement"
29 | - `affiliated_people`: the contents of the 'leaders', 'founded_by', or 'founder' fields if present in the infobox. (This field is currently unused downstream.)
30 | 
31 | ## Setup
32 | 
33 | First, make sure that Redis is installed and running:
34 | 
35 | ```
36 | sudo apt-get install redis-server
37 | ```
38 | 
39 | Alternatively, you can use a Docker container:
40 | 
41 | ```bash
42 | docker run -d -p 6379:6379 --name redis redis
43 | ```
44 | 
45 | Then, install the Python requirements:
46 | 
47 | ```bash
48 | pip install -r requirements.txt
49 | ```
50 | 
51 | 
52 | ## Running
53 | 
54 | 
55 | **NOTE**: You can skip the first step (building the Wiki redirect file) by downloading a prebuilt pickle of the redirect dictionary from Google Drive. Building the redirect file is by far the slowest step (it can take up to 24 hours on a slow machine), so skipping it saves substantial time. Be aware, though, that unpickling a file downloaded from the internet carries security risks. If you're comfortable running an untrusted pickle file, download it from [here](https://drive.google.com/file/d/1zJviHKAm0bQH9xaq5p-dUrVnknDlFgJK/view?usp=sharing) and place it in the `setup` directory.
56 | 
57 | To run the entire process, run the following command:
58 | 
59 | ```bash
60 | bash create_index.sh
61 | ```
62 | 
63 | This will run all the commands needed to:
64 | 
65 | - set up Elasticsearch
66 | - download the English Wikipedia dump
67 | - go through the Wikipedia dump and identify all redirects (see note above--this is slow)
68 | - store those redirects in Redis for easy querying
69 | - go through Wikipedia again,
70 |     - parsing each article
71 |     - looking up alternative names in Redis
72 |     - loading the formatted article into Elasticsearch
73 | 
74 | Alternatively, you can run each command in the bash file separately in a terminal, so you don't have to re-run everything if you encounter an error somewhere.
75 | 
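Once the load completes, you can sanity-check the index by pulling back a few of the fields described under "Index format" above. A minimal sketch; the article title here is only an illustrative example:

```bash
curl -s 'localhost:9200/wiki/_search' -H 'Content-Type: application/json' -d '
{
  "query": {"match": {"title": "Anazarbus"}},
  "_source": ["title", "short_desc", "redirects", "alternative_names"]
}'
```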
76 | ## Updating the index
77 | 
78 | To update the index with a new copy of Wikipedia, you should be able to do the following:
79 | 
80 | 1. Delete the existing Wikipedia index (e.g., `curl -X DELETE 'localhost:9200/wiki'`), but don't destroy the entire Elasticsearch container. (I did this once and realized I'd also nuked the Geonames index, which thankfully only takes about 30 minutes to rebuild.)
81 | 2. Delete the old Wikipedia dump and re-download it: `wget "https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2"`
82 | 3. Re-do the Elasticsearch mapping with the new wiki_mapping.json file, which separates out alternative names and redirects (and changes the way the indexing is done): `curl -XPUT 'localhost:9200/wiki' -H 'Content-Type: application/json' -d @wiki_mapping.json`
83 | 4. SKIP the "build links" step: it's already covered by the redirect file you built or downloaded earlier, and it is absurdly slow.
84 | 5. Load the redirects file into Redis: `python load_wiki_es.py load_redis`
85 | 6. Load Wikipedia into Elasticsearch: `python load_wiki_es.py load_es`
86 | 
87 | 
88 | **A few caveats**:
89 | 
90 | - The `create_index.sh` script assumes you'll run Elasticsearch in a Docker container. If you'd rather run it directly, remove the Docker step and make sure Elasticsearch is listening on the port the script expects.
91 | - The Python `requirements.txt` file pins specific package versions. To avoid overriding existing package versions, consider installing into a virtual environment.
92 | 
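If you'd rather query the finished index from Python, the sketch below uses the same `elasticsearch_dsl` `Search.query(...)` pattern that `actor_contrastive_data.py` in this directory uses; the query string is only an example:

```python
from elasticsearch_dsl import Search, connections

# connect to the local Elasticsearch instance and search the 'wiki' index
es = connections.create_connection(hosts=["http://localhost:9200"])
s = Search(using=es, index="wiki")

# look an article up across the title, redirect, and alternative-name fields
q = {"multi_match": {"query": "Justinian I",
                     "fields": ["title", "redirects", "alternative_names"]}}
for hit in s.query(q)[0:5].execute():
    print(hit.title, "-", hit.short_desc)
```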
-------------------------------------------------------------------------------- /setup/wiki/actor_contrastive_data.py: --------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import pickle
4 | from tqdm import tqdm
5 | import pylcs
6 | import random
7 | from itertools import combinations
8 | import itertools
9 | import jsonlines
10 | import numpy as np
11 | 
12 | files = os.listdir()
13 | versions = [int(re.findall("dict_(\d+)\.", i)[0]) for i in files if re.match("redirect_dict", i)]
14 | 
15 | with open(f"redirect_dict_{max(versions)}.0.pkl", "rb") as f:
16 |     redirect_dict = pickle.load(f)
17 | 
18 | query = "Donald Trump"
19 | 
20 | keys = list(redirect_dict.keys())
21 | sims = [pylcs.lcs_sequence_length(query, i) if i else 0 for i in tqdm(keys)]  # keep indices aligned with `keys`
22 | top_idx = np.argpartition(sims, -4)[-4:]  # indices of the 4 closest keys
23 | ## Build training data
24 | 
25 | 
26 | redirect_list = list(redirect_dict.items())
27 | 
28 | item = redirect_list[33333]  # (scratch) inspect a single entry
29 | 
30 | skip_patt = re.compile(r"#|/")
31 | disambig_patt = re.compile(r"\(disambiguation\)")
32 | 
33 | def clean_entries(entries):
34 |     entries = [i for i in entries if not re.search(skip_patt, i)]
35 |     entries = [re.sub(disambig_patt, "", i).strip() for i in entries]
36 |     entries = list(set(entries))
37 |     return entries
38 | 
39 | pos_data = []
40 | pos_file = "redirect_sim_pos.jsonl"
41 | for n, item in tqdm(enumerate(redirect_list), total=len(redirect_list)):
42 |     entries = [item[0]] + item[1]
43 |     entries = clean_entries(entries)
44 |     if len(entries) > 50:
45 |         entries = random.choices(entries, k=50)
46 |     res = list(combinations(entries, 2))
47 |     random.shuffle(res)
48 |     pos_data.extend(res[:10])
49 |     if n % 5000 == 0:
50 |         #print(n)
51 |         if pos_data:
52 |             with jsonlines.open(pos_file, "a") as f:
53 |                 f.write_all(pos_data)
54 |             pos_data = []
55 | if pos_data:
56 |     with jsonlines.open(pos_file, "a") as f:
57 |         f.write_all(pos_data)
58 | 
59 | 
60 | # 140156 * (4328876 / 18106)  # (scratch) rough extrapolation from a sample to the full redirect list
61 | 
62 | 
63 | neg_data = []
64 | neg_file = "redirect_sim_neg.jsonl"
65 | for n, item in tqdm(enumerate(redirect_list), total=len(redirect_list)):
66 |     entries = [item[0]] + item[1]
67 |     entries = clean_entries(entries)
68 |     for entry in entries:
69 |         other_item = random.choice(redirect_list)
70 |         # avoid the vanishingly rare chance of sampling the same entry
71 |         if other_item[0] == item[0]:
72 |             other_item = random.choice(redirect_list)
73 |         other_entries = [other_item[0]] + other_item[1]
74 |         other_entries = clean_entries(other_entries)
75 |         if other_entries:
76 |             neg_sample = random.choice(other_entries)
77 |             neg_data.append((entry, neg_sample))
78 |     if n % 5000 == 0:
79 |         #print(n)
80 |         if neg_data:
81 |             with jsonlines.open(neg_file, "a") as f:
82 |                 f.write_all(neg_data)
83 |             neg_data = []
84 | if neg_data:
85 |     with jsonlines.open(neg_file, "a") as f:
86 |         f.write_all(neg_data)
87 | 
88 | ## Wiki version
89 | # NOTE: `setup_es` was never defined in this file; the one-line stand-in below is an assumption (elasticsearch_dsl pointed at a local ES instance with the 'wiki' index):
90 | from elasticsearch_dsl import Search, connections
91 | def setup_es(): return Search(using=connections.create_connection(hosts=["http://localhost:9200"]), index="wiki")
92 | conn = setup_es()
93 | box_types = ['officeholder', 'settlement', 'official post', 'company',
94 |              'war faction', 'government agency', 'military unit', 'person',
95 |              'aircraft begin', 'ship begin', 'weapon', 'military person',
96 |              'politician', 'Minister', 'criminal', 'company']
97 | 
98 | # 'honorific-prefix'   (stray string literal, commented out; prefixes are handled in page_to_entries below)
99 | 
100 | box_type = box_types[0]
101 | q = {"multi_match": {"query": box_type,
102 |                      "fields": ['box_type'],
103 |                      "type" : "phrase"}
104 |     }
105 | res = conn.query(q)[0:40].execute()
106 | results = [i.to_dict()['_source'] for i in res['hits']['hits']]
107 | 
108 | # (scratch) the dangling `for page in results:` loop had no body; inspect one hit instead:
109 | page = results[4]
110 | 
111 | 
112 | 
113 | def page_to_entries(page):
114 |     entries = [page['title']] + page['redirects'] + page['alternative_names']
115 |     if 'infobox' in page.keys():
116 |         if 'name' in page['infobox'].keys():
117 |             entries.append(page['infobox']['name'])
118 |         if 'honorific-suffix' in page['infobox'].keys():
119 |             nn = page['infobox']['name'] + " " + page['infobox']['honorific-suffix']
120 |             entries.append(nn)
121 |         if 'honorific-prefix' in page['infobox'].keys():
122 |             nn = page['infobox']['honorific-prefix'] + " " + page['infobox']['name']
123 |             entries.append(nn)
124 |         if 'office' in page['infobox'].keys():
125 |             nn = page['infobox']['office'] + " " + page['infobox']['name']
126 |             entries.append(nn)
127 |         if 'office1' in page['infobox'].keys():
128 |             nn = page['infobox']['office1'] + " " + page['infobox']['name']
129 |             entries.append(nn)
130 |         if 'rank' in page['infobox'].keys():
131 |             nn = page['infobox']['rank'] + " " + page['infobox']['name']
132 |             entries.append(nn)
133 |     entries = clean_entries(entries)
134 |     return entries
135 | 
136 | 
137 | def make_pos_combos(entries, max_pairs=30):
138 |     # First, limit to 50 redirects
139 |     if len(entries) > 50:
140 |         entries = random.choices(entries, k=50)
141 |     res = list(combinations(entries, 2))
142 |     random.shuffle(res)
143 |     return res[0:max_pairs]
144 | 
145 | def get_close_match(query, max_results=3, conn=conn):
146 |     q = {"multi_match": {"query": query,
147 |                          "fields": ['title', 'redirects'],
148 |                          }}
149 |     res = conn.query(q)[0:max_results].execute()
150 |     results = [i.to_dict()['_source'] for i in res['hits']['hits']]
151 |     results = [i for i in results if i['title'] != query]
152 |     return results
153 | 
154 | def get_neg_pairs(entries, page):
155 |     other_names = []
156 |     close_matches = get_close_match(page['title'], 5)
157 |     for cm in close_matches:
158 |         nes = page_to_entries(cm)
159 |         other_names.extend(nes)
160 | 
161 |     samp_size = min(len(entries), 2)
162 |     if samp_size == 0:
163 |         return []
164 |     neg_pairs = []
165 |     for n in other_names:
166 |         es = random.sample(entries, samp_size)
167 |         for e in es:
168 |             neg_pairs.append((e, n))
169 |     return neg_pairs
170 | 
171 | all_pos = []
172 | all_neg = []
173 | neg_file = "redirect_sim_neg2.jsonl"
174 | pos_file = "redirect_sim_pos2.jsonl"
175 | 
176 | box_types = ['officeholder', 'settlement', 'official post', 'company',
177 |              'war faction', 'government agency', 'military unit', 'person',
178 |              'aircraft begin', 'ship begin', 'weapon', 'military person',
179 |              'politician', 'Minister', 'criminal', 'company', 'infobox company',
180 |              'country', 'geopolitical organization']
181 | 
182 | 
183 | for box_type in box_types:  # (was `box_types[]`, a syntax error)
184 |     q = {"multi_match": {"query": box_type,
185 |                          "fields": ['box_type'],
186 |                          "type" : "phrase"}
187 |         }
188 |     res = conn.query(q)[0:10000]
189 | 
190 |     for i in tqdm(res):
191 |         page = i.to_dict()
192 |         entries = page_to_entries(page)
193 |         pos = make_pos_combos(entries)
194 |         neg = get_neg_pairs(entries, page)
195 | 
196 |         all_pos.extend(pos)
197 |         all_neg.extend(neg)
198 | 
199 |         if len(all_pos) > 5000:
200 |             with jsonlines.open(neg_file, "a") as f:
201 |                 f.write_all(all_neg)
202 |             with jsonlines.open(pos_file, "a") as f:
203 |                 f.write_all(all_pos)
204 |             all_pos = []
205 |             all_neg = []
206 | 
207 | #para_terms = ['officer', 'politician', 'diplomat', 'country', 'province', 'municipalities', 'city', 'municipality', 'non-governmental organization']
208 | para_terms = ["Arab", "Andorra","United Arab Emirates","Afghanistan","Antigua and Barbuda","Anguilla","Albania","Armenia","Angola","Argentina","American Samoa","Austria","Australia","Aruba","Azerbaijan","Bosnia and Herzegovina","Barbados","Bangladesh","Belgium","Burkina Faso","Bulgaria","Bahrain","Burundi","Benin","Saint Barthélemy","Bermuda","Brunei","Bolivia","Brazil","Bahamas","Bhutan","Botswana","Belarus","Belize","Canada","Cocos [Keeling] Islands","Democratic Republic of the Congo","Central African Republic","Congo","Switzerland","Côte d’Ivoire","Cook Islands","Chile","Cameroon","China","Colombia","Costa Rica","Cuba","Cape Verde","Curaçao","Christmas Island","Cyprus","Czech Republic","Germany","Djibouti","Denmark","Dominican Republic","Algeria","Ecuador","Estonia","Egypt","Western Sahara","Eritrea","Spain","Ethiopia","European Union","Finland","Fiji","Falkland Islands","Micronesia","Faroe Islands","France","Gabon","United Kingdom","Grenada","Georgia","Ghana","Gibraltar","Greenland","Gambia","Guinea","Guadeloupe","Equatorial Guinea","Greece","Guatemala","Guam","Guinea-Bissau","Guyana","Hong Kong SAR China","Heard Island and McDonald Islands","Honduras","Croatia","Haiti","Hungary","Indonesia","Ireland","Israel","Isle of Man","India","Iraq","Iran","Iceland","Italy","Jersey","Jamaica","Jordan","Japan","Kenya","Kosovo","Kyrgyzstan","Cambodia","Kiribati","Comoros","Saint Kitts and Nevis","North Korea","South Korea","Kuwait","Cayman Islands","Kazakhstan","Laos","Lebanon","Saint Lucia","Liechtenstein","Sri Lanka","Liberia","Lesotho","Lithuania","Luxembourg","Latvia","Libya","Morocco","Monaco","Moldova","Montenegro","Saint Martin","Madagascar","Marshall Islands","Macedonia","Mali","Myanmar [Burma]","Mongolia","Mauritania","Montserrat","Malta","Mauritius","Maldives","Malawi","Mexico","Malaysia","Mozambique","Namibia","New Caledonia","Niger","Norfolk Island","Nigeria","Nicaragua","Netherlands","Norway","Nepal","Nauru","Niue","New Zealand","Oman","Panama","Peru","French Polynesia","Papua New Guinea","Philippines","Pakistan","Poland","Pitcairn Islands","Puerto Rico","Palestinian Territories","Portugal","Palau","Paraguay","Qatar","Romania","Serbia","Russia","Rwanda","Saudi Arabia","Solomon Islands","Seychelles","Sudan","Sweden","Singapore","Saint Helena","Slovenia","Slovakia","Sierra Leone","San Marino","Senegal","Somalia","Suriname","São Tomé and Príncipe","El Salvador","Syria","Swaziland","Turks and Caicos Islands","Chad","Togo","Thailand","Tajikistan","Tokelau","Timor-Leste","Turkmenistan","Tunisia","Tonga","Turkey","Trinidad and Tobago","Tuvalu","Taiwan","Tanzania","Ukraine","Uganda","United Nations","United States","Uruguay","Uzbekistan","Vatican City",
City","Saint Vincent and the Grenadines","Venezuela","British Virgin Islands","U.S. Virgin Islands","Vietnam","Vanuatu","Wallis and Futuna","Samoa","Yemen","South Africa","Zambia","Zimbabwe"] 209 | for para_term in tqdm(para_terms): 210 | q = {"multi_match": {"query": para_term, 211 | "fields": ['intro_para'] 212 | }} 213 | res = conn.query(q)[0:10000] 214 | for i in tqdm(res, leave=False, total=10000): 215 | page = i.to_dict() 216 | entries = page_to_entries(page) 217 | pos = make_pos_combos(entries) 218 | neg = get_neg_pairs(entries, page) 219 | 220 | all_pos.extend(pos) 221 | all_neg.extend(neg) 222 | 223 | if len(all_pos) > 5000: 224 | with jsonlines.open(neg_file, "a") as f: 225 | f.write_all(all_neg) 226 | with jsonlines.open(pos_file, "a") as f: 227 | f.write_all(all_pos) 228 | all_pos = [] 229 | all_neg = [] 230 | 231 | with jsonlines.open(neg_file, "a") as f: 232 | f.write_all(all_neg) 233 | with jsonlines.open(pos_file, "a") as f: 234 | f.write_all(all_pos) 235 | 236 | q = {"multi_match": {"query": "officer", 237 | "fields": ['intro_para'], 238 | }} -------------------------------------------------------------------------------- /setup/wiki/create_index.sh: -------------------------------------------------------------------------------- 1 | echo "Installing Redis..." 2 | sudo apt-get install redis-server 3 | 4 | echo "Starting Docker container and data volume..." 5 | # create the directory first to avoid permission issues when Docker is running as root 6 | mkdir $PWD/wiki_index/ 7 | docker run -d -p 127.0.0.1:9200:9200 -e "discovery.type=single-node" -v $PWD/wiki_index/:/usr/share/elasticsearch/data elasticsearch:7.10.1 8 | 9 | echo "Downloading Wikipedia..." 10 | wget "https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2" 11 | 12 | echo "Creating mappings for the fields in the Wikipedia index..." 13 | curl -XPUT 'localhost:9200/wiki' -H 'Content-Type: application/json' -d @wiki_mapping.json 14 | 15 | echo "Change disk availability limits..." 16 | curl -X PUT "localhost:9200/_cluster/settings" -H 'Content-Type: application/json' -d' 17 | { 18 | "transient": { 19 | "cluster.routing.allocation.disk.watermark.low": "10gb", 20 | "cluster.routing.allocation.disk.watermark.high": "5gb", 21 | "cluster.routing.allocation.disk.watermark.flood_stage": "4gb", 22 | "cluster.info.update.interval": "1m" 23 | } 24 | } 25 | ' 26 | 27 | echo "\nBuilding redirect links..." 28 | python load_wiki_es.py build_links 29 | 30 | echo "\nBuilding links..." 31 | python load_wiki_es.py load_redis 32 | 33 | echo "\nLoading Wikipedia into Elasticsearch..." 
34 | python load_wiki_es.py load_es
35 | 
36 | echo "Done"
37 | 
-------------------------------------------------------------------------------- /setup/wiki/load_wiki_es.py: --------------------------------------------------------------------------------
1 | import multiprocessing
2 | import elasticsearch
3 | from elasticsearch import Elasticsearch, helpers
4 | import mwxml
5 | import mwparserfromhell
6 | import re
7 | from tqdm import tqdm
8 | from textacy.preprocessing.remove import accents as remove_accents
9 | from bz2 import BZ2File as bzopen
10 | import pickle
11 | import plac
12 | import os
13 | import redis
14 | import json
15 | import datetime
16 | 
17 | import logging
18 | 
19 | logger = logging.getLogger()
20 | handler = logging.FileHandler("wiki_es.log")
21 | formatter = logging.Formatter(
22 |     '%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
23 | handler.setFormatter(formatter)
24 | logger.addHandler(handler)
25 | logger.setLevel(logging.DEBUG)
26 | 
27 | # quiet the noisy Elasticsearch client logs (was: `es_logger = elasticsearch.logger`)
28 | logging.getLogger("elasticsearch").setLevel(logging.WARNING)
29 | # NOTE: `get_redirect` is an early, unused draft; the pipeline uses get_page_redirect below.
30 | def get_redirect(page, title=None, text=None):
31 |     if page and not title and not text:
32 |         text = next(page).text
33 |         title = page.title
34 |     if not page:
35 |         logger.debug("not page")
36 |         return None
37 |     if not title:
38 |         return None
39 |     if not text:
40 |         return None
41 | 
42 |     wikicode = mwparserfromhell.parse(str(text))
43 | 
44 |     raw_intro = wikicode.get_sections()[0]
45 |     intro_para = raw_intro.strip_code()
46 |     if re.match("#?(REDIRECT|redirect)", intro_para):
47 |         # skip/ignore redirects for now
48 |         return None
49 | 
50 | 
51 | redirect_pattern = re.compile("#?(REDIRECT|redirect|Redirect)")
52 | 
53 | def get_page_redirect(page, title, text):
54 |     """Returns (original page, new page to redirect to)"""
55 |     wikicode = mwparserfromhell.parse(text)
56 |     raw_intro = wikicode.get_sections()[0]
57 |     if re.match(redirect_pattern, str(raw_intro)):
58 |         new_page = re.findall(r"\[\[(.+?)\]\]", str(raw_intro))
59 |         try:
60 |             new_page = new_page[0]
61 |         except:
62 |             return None
63 |         # Too many false positives come from this splitting. Keep as-is instead, even
64 |         # if that means it won't get added to any articles.
65 |         #new_page = new_page.split("#")[0]
66 |         return (str(title), str(new_page))
67 | 
68 | 
69 | def clean_names(name_list):
70 |     if not name_list:
71 |         return []
72 |     name_list = [re.sub("\|.+?\]\]", "", i).strip() for i in name_list]
73 |     name_list = [re.sub("\[|\]", "", i).strip() for i in name_list]
74 |     # There are some weird entries here like "son:"
75 |     name_list = [i for i in name_list if not i.endswith(":")]
76 |     de_accent = [remove_accents(i) for i in name_list]
77 |     name_list = name_list + de_accent
78 |     name_list = list(set(name_list))
79 |     return name_list
80 | 
81 | 
82 | def parse_wiki_article(page, title=None, text=None, use_redis=True):
83 |     """
84 |     Go through a Wikipedia dump and format the article so it's useful for us.
85 | 
86 |     Pull out the article's:
87 |     - title
88 |     - short desc: (new!) it's similar to the Wikidata short description
89 |     - first para
90 |     - redirects (from Redis)
91 |     - alternative names (anything bold in the first para)
92 |     - info box
93 |     """
94 |     # These had errors earlier: pull them out separately for inspection.
95 |     if title in ['Kyle Rittenhouse', 'Dmitry Peskov', 'Warsaw', 'Brasília', 'Beirut', 'Muhammadu Buhari',
96 |                  'Anil Deshmukh', 'Viktor Orbán']:
97 |         print(f"found article: {title}")
98 |         with open(f"error_articles/list/{title}.txt", "w") as f:
99 |             f.write(text)
100 |     if not title and not text:
101 |         if not page:
102 |             logger.debug(f"not page: {title}")
103 |             return None
104 |         text = next(page).text
105 |         title = page.title
106 |     if not text:
107 |         logger.debug(f"No text for {title}")
108 |         return None
109 | 
110 |     # There are a whole bunch of article types that we want to skip
111 |     if title.endswith(".jpg") or title.endswith(".png"):
112 |         logger.debug(f"Skipping image: {title}")
113 |         return None
114 |     if re.search("\-stub", title):
115 |         logger.debug(f"Skipping Stub: {title}")
116 |         return None
117 |     if re.match("(User|Selected anniversaries)", title):
118 |         logger.debug(f"Skipping User: {title}")
119 |         return None
120 |     if re.search("\([Dd]isambiguation\)", title):
121 |         logger.debug(f"Skipping Disambig: {title}")
122 |         return None
123 |     if re.search("Articles for deletion", title):
124 |         logger.debug(f"Skipping For deletion: {title}")
125 |         return None
126 |     if re.match("List ", title):
127 |         logger.debug(f"Skipping List: {title}")
128 |         return None
129 |     if re.match("Portal ", title):
130 |         logger.debug(f"Skipping Portal: {title}")
131 |         return None
132 |     if re.search("Today's featured article", title):
133 |         logger.debug(f"Skipping featured article: {title}")
134 |         return None
135 |     if re.search("Featured article candidates", title):
136 |         logger.debug(f"Skipping featured article candidate: {title}")
137 |         return None
138 |     if title.startswith("Peer review/"):
139 |         logger.debug(f"Skipping peer review article: {title}")
140 |         return None
141 |     if title.startswith("Requests for adminship/"):
142 |         logger.debug(f"Skipping adminship: {title}")
143 |         return None
144 |     if title.startswith("Featured list candidates/"):
145 |         logger.debug(f"Skipping list candidates: {title}")
146 |         return None
147 |     if title.startswith("Sockpuppet investigations/"):
148 |         logger.debug(f"Skipping sockpuppet: {title}")
149 |         return None
150 |     # clean up intro para? [[File:Luhansk raions eng.svg|thumb|100px|Raions of Luhansk]]
151 |     # also delete the leftover alt names parentheses?
152 | # "[[File:Luhansk raions eng.svg|thumb|100px|Raions of Luhansk]]\nLuhansk,(, ; , , , ; , ), also known as Lugansk and formerly known as Voroshilovgrad (1935-1958)" 153 | 154 | wikicode = mwparserfromhell.parse(str(text)) 155 | 156 | raw_intro = wikicode.get_sections()[0] 157 | intro_para_raw = raw_intro.strip_code() 158 | # strip out the occasional stuff that slips through 159 | intro_para = re.sub("(\[\[.+?\]\])", "", intro_para_raw).strip() 160 | # delete thumbs (not removed by strip_code()): 161 | intro_para = re.sub("^thumb\|.+?\n", "", intro_para) 162 | # do it again, the lazy way 163 | intro_para = re.sub("^thumb\|.+?\n", "", intro_para) 164 | # delete the first set of paratheses 165 | intro_para = re.sub("\(.+?\)", "", intro_para, 1) 166 | if not intro_para: 167 | logger.debug(f"No intro para for {title}.") 168 | #logger.debug(f"{wikicode.get_sections()[:2]}") 169 | return None 170 | if re.match("#?(REDIRECT|redirect|Redirect)", intro_para): 171 | logger.debug(f"Detected redirect in first para: {title}") 172 | # skip/ignore redirects for now 173 | return None 174 | if re.search("\*?\n?Category\:", intro_para): 175 | logger.debug(f"Category: {title}") 176 | return None 177 | if intro_para.startswith("Category:"): 178 | logger.debug(f"Category: {title}") 179 | return None 180 | if intro_para.startswith(""): 181 | logger.debug(f"Sneaky category? {title}") 182 | return None 183 | if re.search("may refer to", intro_para[0:100]): 184 | logger.debug(f"may refer to: {title}") 185 | return None 186 | if re.search("most often refers", intro_para[0:100]): 187 | logger.debug(f"most often refers: {title}") 188 | return None 189 | if re.search("most commonly refers", intro_para[0:100]): 190 | logger.debug(f"most commonly refers: {title}") 191 | return None 192 | if re.search("[Pp]ortal\:", intro_para[0:100]): 193 | logger.debug(f"Portal: {title}") 194 | return None 195 | alternative_names = re.findall("'''(.+?)'''", str(raw_intro)) 196 | 197 | redirects = [] 198 | if use_redis: 199 | redis_db = redis.StrictRedis(host="localhost", port=6379, db=0, charset="utf-8", decode_responses=True) 200 | redirects = redis_db.get(title) 201 | if redirects: 202 | redirects = redirects.split(";") 203 | 204 | if re.match("Categories for", title): 205 | return None 206 | 207 | try: 208 | short_desc = re.findall("\{\{[Ss]hort description\|(.+?)\}\}", str(raw_intro))[0].strip() 209 | except: 210 | logger.debug(f"Error getting short desc for {title}") 211 | #title_mod = re.sub("/", "_", title) 212 | #with open(f"error_articles/short_desc/{title_mod}.txt", "w") as f: 213 | # f.write(str(raw_intro)) 214 | short_desc = "" 215 | 216 | 217 | params = {"title": title, 218 | "short_desc": short_desc, 219 | "intro_para": intro_para.strip(), 220 | "alternative_names": clean_names(alternative_names), 221 | "redirects": clean_names(redirects), 222 | "affiliated_people": [], 223 | "box_type": None} 224 | 225 | for template in wikicode.get_sections()[0].filter_templates(): 226 | if re.search("[Ii]nfobox", template.name.strip()): 227 | # do it this way to prevent overwriting 228 | info_box = {p.name.strip(): p.value.strip_code().strip() for p in template.params} 229 | params['infobox'] = info_box 230 | params['box_type'] = re.sub("Infobox", "", str(template.name)).strip() 231 | break 232 | 233 | if 'infobox' in params.keys(): 234 | for k in ['name', 'native_name', 'other_name', 'alias', 'birth_name', 'nickname', 'other_names']: 235 | if k in params['infobox'].keys(): 236 | newline_alt = [i.strip() for i in 
params['infobox'][k].split("\n") if i.strip()] 237 | new_alt = [j.strip() for i in newline_alt for j in i.split(",")] 238 | params['alternative_names'].extend(new_alt) 239 | 240 | affiliated_people = [] 241 | for k in ['leaders', 'founded_by', 'founder']: 242 | if k in params['infobox'].keys(): 243 | aff_people = [i.strip() for i in params['infobox'][k].split("\n") if i.strip()] 244 | aff_people = [j.strip() for i in aff_people for j in i.split(",")] 245 | affiliated_people.extend(aff_people) 246 | 247 | params['affiliated_people'] = clean_names(affiliated_people) 248 | params['alternative_names'] = clean_names(params['alternative_names']) 249 | 250 | 251 | raw_categories = wikicode.get_sections()[-1].strip_code() 252 | categories = re.findall("Category:(.+?)\n", raw_categories) 253 | params['categories'] = categories 254 | 255 | if 'infobox' in params.keys(): 256 | for k in ['map']: 257 | if k in params['infobox'].keys(): 258 | del params['infobox'][k] 259 | 260 | params['update'] = datetime.date.today().isoformat() 261 | logger.debug(f"Good article: {title}") 262 | 263 | if title in ['Kyle Rittenhouse', 'Dmitry Peskov', 'Warsaw', 'Brasília', 'Beirut', 'Muhammadu Buhari', 264 | 'Anil Deshmukh', 'Viktor Orbán']: 265 | with open(f"error_articles/list/{title}.json", "w") as f: 266 | json.dump(params, f) 267 | return params 268 | 269 | def wrapper_loader(title, text, page=None): 270 | res = parse_wiki_article(page, title, text) 271 | if not res: 272 | return None 273 | action = {"_index" : "wiki", 274 | #"_id" : res['title'], # it turns out the titles aren't globally unique, so can't use as an ID 275 | "_source" : res} 276 | return action 277 | 278 | 279 | def load_batch_es(page_batch, p, es): 280 | actions = [p.apply_async(wrapper_loader, (title, text)) for title, text in page_batch if title] 281 | actions = [i.get() for i in tqdm(actions, leave=False) if i] 282 | actions = [i for i in actions if i] 283 | try: 284 | helpers.bulk(es, actions, chunk_size=-1, raise_on_error=False) 285 | logger.info("Bulk loading success") 286 | except Exception as e: 287 | logger.info(f"Error in loading Wiki batch!!: {e}. 
Loading stories individually...")
288 |         for i in actions:
289 |             try:
290 |                 # helpers.bulk expects an iterable of actions, so wrap the single action in a list
291 |                 response = helpers.bulk(es, [i], chunk_size=-1, raise_on_error=False)
292 |                 if response[1]:
293 |                     logger.info(f"Error on loading story {i}: {response[1]}")
294 |             except Exception as e:
295 |                 logger.info(f"Skipping single Wiki story {e}")
296 | 
297 | 
298 | 
299 | # (unused interactive helper; relies on `redirect_dict` existing as a global)
300 | def redirect_wrapper(title, text):
301 |     redir = get_page_redirect(None, title, text)
302 |     if redir:
303 |         if redir[1] not in redirect_dict.keys():
304 |             redirect_dict[redir[1]] = [redir[0]]
305 |         else:
306 |             redirect_dict[redir[1]] = list(set(redirect_dict[redir[1]] + [redir[0]]))
307 | 
308 | 
309 | def read_clean_redirects():
310 |     files = os.listdir()
311 |     versions = [int(re.findall("dict_(\d+)\.", i)[0]) for i in files if re.match("redirect_dict", i)]
312 |     max_file = f"redirect_dict_{max(versions)}.0.pkl"
313 |     logger.info(f"Loading {max_file} into redis")
314 |     with open(max_file, "rb") as f:
315 |         redirect_dict = pickle.load(f)
316 | 
317 |     # Merge lowercase versions of keys with their non-lowercase versions.
318 |     # The `k != k.lower()` guard matters: without it, keys that are already
319 |     # lowercase get merged with themselves and then deleted outright.
320 |     #len = 1132887
321 |     del_list = []
322 |     for k in redirect_dict.keys():
323 |         if k != k.lower() and k.lower() in redirect_dict.keys():
324 |             redirect_dict[k] = redirect_dict[k] + redirect_dict[k.lower()]
325 |             del_list.append(k.lower())
326 | 
327 |     for d in del_list:
328 |         if d in redirect_dict.keys():
329 |             del redirect_dict[d]
330 |     # len = 1106119
331 |     return redirect_dict
332 | 
333 | 
334 | @plac.pos('process', "Which process to run?", choices=['build_links', 'load_redis', 'load_es'])
335 | @plac.pos('file', "Wikipedia dump location")
336 | @plac.pos('es_batch', "Elasticsearch batch size")
337 | @plac.pos('threads', "number of threads to use")
338 | def process(process, file="enwiki-latest-pages-articles.xml.bz2", es_batch=5000, threads=10):
339 |     # plac passes command-line arguments in as strings
340 |     es_batch, threads = int(es_batch), int(threads)
341 |     p = multiprocessing.Pool(threads)
342 |     logger.info(f"Reading from {file}")
343 |     if re.search("bz2", file):
344 |         dump = mwxml.Dump.from_file(bzopen(file, "r"))
345 |     else:
346 |         dump = mwxml.Dump.from_file(file)
347 | 
348 |     #dump = mwxml.Dump.from_file(open("Wikipedia-protest-export.xml"))
349 |     # 1 core = 11.077 total
350 |     # 5 cores = 3.254 total
351 |     # 10 cores = 3.075 total
352 | 
353 |     if process == "build_links":
354 |         redirect_dict = {}
355 |         logger.info("Building redirect link dictionary...")
356 |         page_batch = []
357 |         for n, page in tqdm(enumerate(dump), total=22373694):
358 |             if n % 1000000 == 0 and n > 0:
359 |                 k = n / 1000000
360 |                 with open(f"redirect_dict_{k}.pkl", "wb") as f:
361 |                     pickle.dump(redirect_dict, f)
362 |                 logger.info(f"Dumped at {k} x 1,000,000")
363 |                 #break
364 |                 # continue
365 |             if page:
366 |                 page_batch.append((page.title, next(page).text))
367 |                 if len(page_batch) % 5000 == 0:
368 |                     # page_batch always holds (title, text) tuples, so the old non-bz2 branch
369 |                     # (which passed the tuple in as `page`) was broken; use a single path
370 |                     actions = [p.apply_async(get_page_redirect, (None, title, text)) for title, text in page_batch if title]
371 |                     actions = [i.get() for i in tqdm(actions, leave=False)]
372 |                     for redir in tqdm(actions, leave=False):
373 |                         if not redir:
374 |                             continue
375 |                         if redir[1] not in redirect_dict.keys():
376 |                             redirect_dict[redir[1]] = [redir[0]]
377 |                         else:
378 |                             redirect_dict[redir[1]] = list(set(redirect_dict[redir[1]] + [redir[0]]))
379 |                     page_batch = []
380 |         # get the final batch
381 |         # This one isn't wrapped in a function to make sure redirect_dict stays in the right scope
382 |         # (the old code re-looped over the previous batch's stale `actions` here and
383 |         # silently dropped any pages left in the final, partial batch)
384 |         actions = [p.apply_async(get_page_redirect, (None, title, text)) for title, text in page_batch if title]
385 |         actions = [i.get() for i in tqdm(actions, leave=False)]
386 |         for redir in tqdm(actions, leave=False):
387 |             if not redir:
388 |                 continue
389 |             if redir[1] not in redirect_dict.keys():
390 |                 redirect_dict[redir[1]] = [redir[0]]
391 |             else:
392 |                 redirect_dict[redir[1]] = list(set(redirect_dict[redir[1]] + [redir[0]]))
393 |         k = (n // 1000000) + 1  # recompute: `k` is undefined here for dumps under 1,000,000 pages
394 |         with open(f"redirect_dict_{k}.0.pkl", "wb") as f:
395 |             pickle.dump(redirect_dict, f)
396 | 
397 | 
398 |     elif process == "load_redis":
399 |         logger.info("Reading redirect dict...")
400 |         redirect_dict = read_clean_redirects()
401 |         redis_db = redis.StrictRedis(host="localhost", port=6379, db=0)
402 |         pipe = redis_db.pipeline()
403 |         for n, item in tqdm(enumerate(redirect_dict.items()), total=len(redirect_dict)):
404 |             k, v = item
405 |             v_str = ";".join(v)
406 |             pipe.set(k, v_str)
407 |             if n % 1000 == 0:
408 |                 pipe.execute()
409 |         # get the final batch
410 |         pipe.execute()
411 | 
412 |     elif process == "load_es":
413 |         logger.info("Loading Wikipedia into Elasticsearch")
414 |         # (the old call passed `urls=`, which is not a valid Elasticsearch argument)
415 |         es = Elasticsearch('http://localhost:9200/', timeout=60, max_retries=2)
416 | 
417 |         page_batch = []
418 |         for n, page in tqdm(enumerate(dump), total=21726007):
419 |             if page:
420 |                 page_batch.append((page.title, next(page).text))
421 |                 if len(page_batch) % es_batch == 0:
422 |                     #logger.debug(f"Loaded {page.title}")
423 |                     load_batch_es(page_batch, p, es)
424 |                     page_batch = []
425 |         # load final batch
426 |         load_batch_es(page_batch, p, es)
427 | 
428 | 
429 | if __name__ == '__main__':
430 |     plac.call(process)
431 | 
432 | 
-------------------------------------------------------------------------------- /setup/wiki/load_wiki_scratch.py: --------------------------------------------------------------------------------
1 | # Scratch/debugging snippets. The imports below are assumed; they were missing from the original file.
2 | import re
3 | import mwxml
4 | from bz2 import BZ2File as bzopen
5 | from tqdm import tqdm
6 | from load_wiki_es import parse_wiki_article
7 | 
8 | file = "enwiki-latest-pages-articles.xml.bz2"
9 | dump = mwxml.Dump.from_file(bzopen(file, "r"))
10 | 
11 | results = []
12 | title_list = ['Anil Deshmukh', 'Mamata Banerjee', 'Sameer Wankhede', 'Brasilia', 'Kyle Rittenhouse', 'Ahmad Massoud', 'Ariel Henry', 'Augusto Aras',
13 |               'Geneva', 'Beirut']
14 | 
15 | for n, page in tqdm(enumerate(dump), total=100):
16 |     title = page.title
17 |     if title not in title_list:
18 |         continue
19 |     text = next(page).text
20 |     r = parse_wiki_article(page, title, text)
21 |     results.append(r)
22 |     print(title)
23 |     #if n > 100:
24 |     #    break
25 | 
26 | # (stray fragments, commented out:)
27 | # , next(page).text)
28 | # parse_wiki_article()
29 | 
30 | 
31 | raw = """{{Short description|Ancient Greek city in Anatolia}}\n{{Use dmy dates|date=April 2020}}\n{{Infobox ancient site\n|name = Anazarbus\n|native_name = Anavarza {{in lang|tr}}\n|alternate_name = Caesarea, Justinopolis\n|image = Anavarza_Triumphal_arch_in_Anazarbus_2754.jpg\n|alt = \n|caption = The triumphal arch of Anazarbus was later converted to the city\'s South Gate.\n|map_type = Turkey\n|map_alt = \n|map_size = 270\n|coordinates = {{coord|37|15|50|N|35|54|20|E|display=inline,title}}\n|location = [[Adana Province]], Turkey\n|region = [[Cilicia]]\n|type = Settlement\n|part_of = \n|length = \n|width = \n|area = \n|height = \n|builder = \n|material = \n|built = \n|abandoned = \n|epochs = \n|cultures = \n|dependency_of = \n|occupants = \n|event = \n|excavations = \n|archaeologists = \n|condition = \n|ownership = \n|management = \n|public_access = \n|website = \n|notes = \n}}\n\n[[File:Anazarbe_vue_générale_1.jpg|thumb|right|300px|General view of the site]]\n[[Image:Anazarbus clikya west gate and anvarza castle.JPG|thumb|right|200px|Anazarbus West Gate]]\n\'\'\'Anazarbus \'\'\' ({{lang-grc|Ἀναζαρβός}}, medieval \'\'\'Ain Zarba\'\'\'; modern
\'\'\'Anavarza\'\'\'; {{lang-ar|عَيْنُ زَرْبَة}}) was an ancient [[Cilicia]]n city. Under the late Roman Empire, it was the capital of [[Cilicia Secunda]]. [[Roman emperor]] [[Justinian I]] rebuilt the city in 527 after a strong earthquake hit it. It was destroyed in 1374 by the forces of [[Mamluk Empire]], after their conquest of Armenia.\n\n"""
32 | raw = """'{{Short description|Ethnic group in Japan and Russia}}\n{{For|the ethnic group of Western China|Äynu people}}\n{{Use mdy dates|date=April 2020}}\n{{Infobox ethnic group\n| group = Ainu\n| image = File:Ainu Marriage.jpg \n| image_alt = \n| caption = Ainu at a traditional marriage ceremony in [[Hokkaido]].\n| population = {{plainlist|\n* 25,000\n* (Japanese government estimate, 2002)\n* ≥200,000\n* (Unofficial estimate){{cite book|last=Poisson|first=Barbara Aoki|year=2002|title=The Ainu of Japan|publisher=Lerner Publications|location=Minneapolis|page=[https://archive.org/details/ainuofjapan00pois/page/5 5]|isbn=978-0-82254-176-9|url-access=registration|url=https://archive.org/details/ainuofjapan00pois/page/5}}"""
33 | raw = """"{{short description|Political philosophy and movement}}\n{{other uses}}\n{{redirect2|Anarchist|Anarchists|other uses|Anarchist (disambiguation)}}\n{{distinguish|Anarchy}}\n{{pp-semi-indef}}\n{{good article}}\n{{use British English|date=August 2021}}\n{{use dmy dates|date=August 2021}}\n{{anarchism sidebar}}\n{{basic forms of government}}\n'"""
34 | # ([Ss] added to match the pattern in load_wiki_es.py; the `raw` above uses lowercase "short description")
35 | re.findall("\{\{[Ss]hort description\|(.+?)\}\}", raw)[0]
36 | 
37 | 
38 | raw = """thumb|Main amethyst-producing countries\n\nAmethyst is a violet variety of quartz. The name comes from the Koine Greek αμέθυστος amethystos from α- a-, "not" and μεθύσκω (Ancient Greek)"""
39 | raw = """\n\nAmethyst is a violet variety of quartz. The name comes from the Koine Greek αμέθυστος amethystos from α- a-, "not" and μεθύσκω (Ancient Greek)"""
40 | re.sub("^thumb\|.+?\n", "", raw)
-------------------------------------------------------------------------------- /setup/wiki/requirements.txt: --------------------------------------------------------------------------------
1 | elasticsearch>=7.16.2,<8.0
2 | elasticsearch-dsl>=7.4.0,<8.0
3 | mwparserfromhell>=0.6.3,<1.0
4 | mwtypes>=0.3.2,<1.0
5 | mwxml>=0.3.3,<1.0
6 | tqdm
7 | textacy>=0.12.0,<1.0
8 | redis>=4.1.0,<5.0
9 | plac  # used by load_wiki_es.py; was missing
-------------------------------------------------------------------------------- /setup/wiki/wiki_mapping.json: --------------------------------------------------------------------------------
1 | {
2 |     "settings" : {
3 |         "number_of_shards" : 1,
4 |         "number_of_replicas" : 0
5 |     },
6 |     "mappings" : {
7 |         "properties" : {
8 |             "categories" : {"type" : "keyword", "index": "true"},
9 |             "intro_para" : {"type" : "text"},
10 |             "title" : {"type" : "text"},
11 |             "alternative_names" : {"type" : "text", "similarity" : "BM25",
12 |                 "norms": true},
13 |             "redirects" : {"type" : "text", "similarity" : "boolean",
14 |                 "norms": true},
15 |             "box_type" : {"type" : "keyword", "index": "true"},
16 |             "infobox": {"type": "flattened"}
17 |         }
18 |     }
19 | }
20 | 
--------------------------------------------------------------------------------