├── .gitignore ├── README.md ├── collector.py ├── datasets ├── 200-gmm.jsonl └── bgmm-p200-c12057022.jsonl ├── input_entry.py ├── requirements.txt ├── results ├── img │ ├── cum_dist_U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-22.png │ ├── cum_dist_U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-02.png │ ├── cum_dist_U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-25.png │ ├── cum_dist_U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-01.png │ ├── cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-25.png │ ├── cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-10-31.png │ ├── cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_sum-mf_sum-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-29.png │ ├── cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-10-31.png │ ├── cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-01.png │ ├── cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-23.png │ ├── cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_sum-NP-weighted-N30e5-B12-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-29.png │ ├── density_U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-22.png │ ├── density_U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-02.png │ ├── density_U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-25.png │ ├── density_U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-01.png │ ├── density_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-25.png │ ├── density_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-10-31.png │ ├── density_U-NON-GEO+GEO-ONLY-O5-d-total_sum-mf_sum-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-29.png │ ├── density_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-10-31.png │ ├── density_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-01.png │ ├── density_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-23.png │ ├── density_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_sum-NP-weighted-N30e5-B12-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-29.png │ ├── gmm_contour_plot.png │ ├── gmm_likelihood_world.png │ ├── gmm_user_summary_S300_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N6615_VF-NON-GEO_2022-11-17_N6615_2022-11-17.png │ ├── gmm_user_summary_S38_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N218_VF-NON-GEO_2022-11-24_N218_2022-11-24.png │ ├── 
gmm_user_summary_S500_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N6615_VF-NON-GEO_2022-11-17_N6615_2022-11-17.png │ ├── text_map_U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_2022-11-23.png │ ├── text_map_U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_2022-11-23.png │ ├── text_map_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_2022-11-23.png │ └── text_map_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_2022-11-23.png ├── metric │ ├── EF-U-NON-GEO+GEO-ONLY-O3-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2023-01-25.txt │ ├── EIS-US-U-NON-GEO+GEO-ONLY-O3-d-total_mean-mf_mean-pos_spher-weighted-N3e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N76000_VF-NON-GEO_2023-02-09.txt │ ├── EIS-US-U-NON-GEO+GEO-ONLY-O3-d-total_mean-mf_mean-pos_spher-weighted-N3e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N76000_VF-TEXT-ONLY_2023-02-09.txt │ ├── U-GEO-O5-d-total_mean-pos_spher-weighted-N30e5-B12-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2022-11-28.txt │ ├── U-GEO-O5-d-total_mean-pos_spher-weighted-N30e5-B12-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2022-11-29.txt │ ├── U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-NON-GEO_2022-11-02_metric_N300000_VF-NON-GEO_2022-11-22.txt │ ├── U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-NON-GEO_2023-02-19_metric_N300000_VF-NON-GEO_2023-02-21.txt │ ├── U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-TEXT-ONLY_2022-11-02_metric_N300000_VF-TEXT-ONLY_2022-11-22.txt │ ├── U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-NON-GEO_2022-11-24_metric_N300000_VF-NON-GEO_2022-11-24.txt │ ├── U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-TEXT-ONLY_2022-11-01_metric_N300000_VF-TEXT-ONLY_2022-11-22.txt │ ├── U-NON-GEO+GEO-ONLY-O10-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2022-11-30.txt │ ├── U-NON-GEO+GEO-ONLY-O10-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2022-11-30.txt │ ├── U-NON-GEO+GEO-ONLY-O100-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2022-12-08.txt │ ├── U-NON-GEO+GEO-ONLY-O100-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-TEXT-ONLY_2022-11-29_metric_N300000_VF-TEXT-ONLY_2023-02-18.txt │ ├── U-NON-GEO+GEO-ONLY-O3-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2023-02-18.txt │ ├── U-NON-GEO+GEO-ONLY-O3-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2023-02-18.txt │ ├── U-NON-GEO+GEO-ONLY-O3-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2023-02-18.txt │ ├── U-NON-GEO+GEO-ONLY-O3-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2023-02-19.txt │ ├── 
U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-all_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2022-12-05.txt │ ├── U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-all_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2022-12-05.txt │ ├── U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N100_VF-NON-GEO_2022-11-17.txt │ ├── U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2023-02-20.txt │ ├── U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2023-02-24.txt │ ├── U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2023-02-24.txt │ ├── U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N5000_UVF-NON-GEO_2023-02-24.txt │ ├── U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N5000_UVF-TEXT-ONLY_2023-02-24.txt │ ├── U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N100_VF-NON-GEO_2022-11-17_metric_N100_VF-NON-GEO_2022-11-22.txt │ ├── U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-NON-GEO_2022-11-02_metric_N300000_VF-NON-GEO_2022-11-23.txt │ ├── U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-TEXT-ONLY_2022-11-02_metric_N300000_VF-TEXT-ONLY_2022-11-23.txt │ ├── U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2022-11-23.txt │ ├── U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2022-11-24.txt │ ├── U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-NON-GEO_2022-10-31_metric_N300000_VF-NON-GEO_2022-11-22.txt │ ├── U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-TEXT-ONLY_2022-11-01_metric_N300000_VF-TEXT-ONLY_2022-11-22.txt │ ├── U-NON-GEO+GEO-ONLY-O50-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2022-12-09.txt │ ├── U-NON-GEO+GEO-ONLY-O50-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2022-12-09.txt │ ├── U-NON-GEO-O5-d-total_mean-pos_spher-weighted-N30e5-B12-E2-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2022-11-28.txt │ ├── U-NON-GEO-O5-d-total_mean-pos_spher-weighted-N30e5-B12-E2-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2022-11-28.txt │ ├── U-TEXT-ONLY-O5-d-total_mean-pos_spher-weighted-N30e5-B12-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2022-11-30.txt │ ├── U-TEXT-ONLY-O5-d-total_mean-pos_spher-weighted-N30e5-B12-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2022-11-30.txt │ ├── US-U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N0e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N50_VF-NON-GEO_2022-11-22.txt │ ├── US-U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2023-02-23.txt │ ├── 
US-U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2023-02-23.txt │ ├── US-U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N4999_UVF-NON-GEO_2023-02-24.txt │ └── US-U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N4999_UVF-TEXT-ONLY_2023-02-24.txt └── val-data │ ├── U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N1000_2022-10-25.jsonl │ └── U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N6615_VF-NON-GEO_2022-11-17.jsonl ├── runs ├── prob │ ├── U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]-2022-10-06_logs │ │ └── events.out.tfevents.1665082080.gpu127.3070880.0 │ └── U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]-2022-10-06_logs │ │ └── events.out.tfevents.1665062385.gpu118.2632089.0 └── spat │ ├── U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]-2022-10-09_logs │ └── events.out.tfevents.1665313776.gpu113.3164054.0 │ └── U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]-2022-10-23_logs │ └── events.out.tfevents.1666523343.gpu148.1960620.0 ├── supplementary_resources ├── article_draft.pdf ├── img │ ├── loss-graph-prob.png │ ├── loss-graph-spat.png │ ├── map-density.png │ ├── model-train.png │ ├── mop-loss.png │ ├── prediction-example.png │ ├── sop-loss.png │ └── total-loss.png └── scripts │ ├── bash │ ├── collector.sh │ ├── data.sh │ └── train.sh │ └── python │ ├── bert_train.py │ ├── camambert-test.py │ ├── coords_plots.py │ ├── data-from-test.py │ ├── dev-loss-func-test.py │ ├── geotext-dataframe.py │ ├── hf_repo.py │ ├── json_split.py │ ├── loss_graph_prob.py │ ├── loss_graph_spat.py │ ├── ner-gazetteer-test.py │ ├── old_project_examples │ ├── extract_tweets_with_smileys.py │ ├── train_sentiment_classifier.py │ └── tweet_utils.py │ └── transformers-tutorial-test.py ├── text_result.py ├── train_bert.py ├── utils ├── benchmarks.py ├── cosine_scheduler.py ├── model_trainer.py ├── prediction.py ├── regressor.py ├── result_manager.py ├── result_visuals.py └── twitter_dataset.py └── valid_data.py /.gitignore: -------------------------------------------------------------------------------- 1 | ### Python template 2 | 3 | .idea/ 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 102 | __pypackages__/ 103 | 104 | # Celery stuff 105 | celerybeat-schedule 106 | celerybeat.pid 107 | 108 | # SageMath parsed files 109 | *.sage.py 110 | 111 | # Environments 112 | .env 113 | .venv 114 | env/ 115 | venv/ 116 | ENV/ 117 | env.bak/ 118 | venv.bak/ 119 | 120 | # Spyder project settings 121 | .spyderproject 122 | .spyproject 123 | 124 | # Rope project settings 125 | .ropeproject 126 | 127 | # mkdocs documentation 128 | /site 129 | 130 | # mypy 131 | .mypy_cache/ 132 | .dmypy.json 133 | dmypy.json 134 | 135 | # Pyre type checker 136 | .pyre/ 137 | 138 | # pytype static type analyzer 139 | .pytype/ 140 | 141 | # Cython debug symbols 142 | cython_debug/ 143 | 144 | 145 | # ---- project specific 146 | 147 | # model files 148 | *.pth 149 | *.bin 150 | /models/hf/ 151 | 152 | # logs 153 | /runs/ 154 | 155 | # sensitive data 156 | /results/val-data/ 157 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Geolocation Prediction BERT model 3 | 4 | This project aims to solve the tweet/user geolocation prediction task and to provide a flexible methodology for geotagging textual big data. The suggested approach uses neural networks for natural language processing (NLP) to estimate a location either as coordinates (longitude, latitude) or as a two-dimensional Gaussian Mixture Model (GMM). The proposed models were finetuned on a Twitter dataset, using pretrained Bidirectional Encoder Representations from Transformers (BERT) as the base model.
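To make the GMM formulation concrete: for the probabilistic models, a single prediction is a weighted set of two-dimensional Gaussian components over (longitude, latitude), and the mixture density can be evaluated anywhere on the map. The sketch below is purely illustrative (it is not part of the repository and the component values are invented); the actual output handling lives in `utils/result_manager.py` and `utils/prediction.py`.

```python
# Illustrative only: what "location as a 2D GMM" means numerically.
# The weights, means and sigmas below are invented example values.
import numpy as np
from scipy.stats import multivariate_normal

weights = np.array([0.62, 0.20, 0.10, 0.05, 0.03])             # mixture weights, sum to 1
means = np.array([[2.35, 48.85], [-0.13, 51.51], [13.40, 52.52],
                  [-73.99, 40.73], [139.69, 35.69]])           # (lon, lat) of each outcome
sigmas = np.array([1.5, 2.0, 2.5, 3.0, 3.0])                   # spherical std devs in degrees

def gmm_density(lon, lat):
    """Mixture likelihood of a point under the predicted GMM."""
    return sum(w * multivariate_normal.pdf([lon, lat], mean=m, cov=s ** 2)
               for w, m, s in zip(weights, means, sigmas))

print(gmm_density(2.29, 48.86))  # high density near the dominant component
```

The `spher` covariance option seen throughout the result file names refers to spherical (scalar-per-component) covariances of this kind.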
5 | 6 | [Predicting the Geolocation of Tweets Using BERT-Based Models Trained on Customized Data](https://arxiv.org/pdf/2303.07865.pdf) - paper pre-print on arXiv 7 | 8 | [geo-bert-multilingual](https://huggingface.co/k4tel/geo-bert-multilingual) - HuggingFace repository of the best model (Probabilistic, 5 outcomes, NON-GEO + GEO-ONLY) trained on the worldwide Twitter dataset 9 | 10 | ## Project structure 11 | 12 | - **datasets** - source folder for the input dataset files used during training and evaluation. For correct reading, the files should be in .jsonl format containing "lon", "lat", "text", "user" and "place" columns (JSON object fields). 13 | 14 | - **models** - folder containing files of local models and checkpoints in .pth format. 15 | 16 | - **results** - folder for output files such as images, evaluated datasets, and performance metric reports. 17 | 18 | - **utils** - folder containing the core utility Python classes 19 | - `benchmarks.py` - loss function computation and Tensorboard logging of training metrics 20 | - `cosine_scheduler.py` - [Cyclic Cosine Decay Learning Rate Scheduler](https://github.com/abhuse/cyclic-cosine-decay) 21 | - `twitter_dataset.py` - dataset wrapper class that implements feature forming, tokenization, and creation of PyTorch dataloaders 22 | - `regressor.py` - linear regression wrapper layer for BERT base models 23 | - `result_manager.py` - postprocessing of model outputs, writing and reading of evaluation result .jsonl files, performance metrics computation 24 | - `result_visuals.py` - visualization of results on matplotlib plots 25 | - `prediction.py` - single text prediction routine 26 | - `model_trainer.py` - training and evaluation of the models 27 | 28 | - `train_bert.py` - command line parameter input, entry point for training and evaluation 29 | - `input_entry.py` - entry point for single text prediction using local or HF repository models 30 | 31 | Additional: 32 | 33 | - **runs** - folder for storing training Tensorboard log files 34 | 35 | - **supplementary_resources** - folder containing testing and development Python scripts, and bash scripts for running jobs on a cluster with the Slurm management system 36 | 37 | - `valid_data.py` - shortcut for results management and visualization 38 | - `collector.py` - parsing of Twitter database files to collect dataset files 39 | 40 | ## Usage/Examples 41 | 42 | To run the project locally, first clone the repository: 43 | 44 | ```bash 45 | git clone https://github.com/K4TEL/geo-twitter.git 46 | ``` 47 | 48 | Then, in your Python environment, run: 49 | 50 | ```bash 51 | pip install -r requirements.txt 52 | ``` 53 | 54 | ### Training 55 | 56 | **NOTE!** To run finetuning, place a dataset file (.jsonl) containing "lon", "lat", "text", "user" and "place" columns (JSON object fields, no headers required) into the **datasets** folder. 57 | Then change the dataset file name in `train_bert.py` manually or by passing the `-d <filename>.jsonl` argument. 58 | 59 | To launch finetuning with the default hyperparameters, run: 60 | 61 | ```bash 62 | python train_bert.py --train 63 | ``` 64 | 65 | You can change the default hyperparameters manually in `train_bert.py` or pass command-line arguments using the predefined flags. 66 | The list of all flags can be found in the same entry point file. 67 | 68 | In practice, the learning rate, scheduler type, number of epochs, loss function parameters and target columns should remain the same.
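The dataset file itself is usually the main thing that changes between runs. For reference, one record in the expected .jsonl format can be produced as follows (a hedged sketch, not part of the repository; all field values are invented, and the concatenated `user`/`place` strings only mirror how `collector.py` assembles them):

```python
# Hypothetical helper, not part of the repo: appends one record in the format
# train_bert.py expects -- a .jsonl file with "lon", "lat", "text", "user", "place".
import json

record = {
    "lon": 14.42076,                                                   # longitude in degrees
    "lat": 50.08804,                                                   # latitude in degrees
    "text": "Morning coffee by the river",                             # tweet text
    "user": "Jane jane_doe Prague-based researcher Prague",            # name, screen name, description, location
    "place": "Czech Republic city Prague Prague, Czech Republic CZ",   # country, type, name, full name, code
}

with open("datasets/example.jsonl", "a", encoding="utf-8") as f:
    f.write(json.dumps(record, ensure_ascii=False) + "\n")
```

Training on such a file then amounts to `python train_bert.py --train -d example.jsonl` (the file name here is only an example).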
69 | Commonly changed parameters include the number of outcomes, covariance type, features, dataset file name, training dataloader size, batch size and log step. 70 | 71 | During finetuning, training metrics and test metrics (calculated at the end of each epoch) are written to the **runs** folder. 72 | Model performance tracking is implemented using the Tensorboard Python library. 73 | Model files and their checkpoints are saved to the **models** directory automatically. 74 | 75 | ### Evaluation 76 | 77 | **NOTE!** To run evaluation, place a dataset file into the **datasets** folder 78 | and make sure a finetuned model file in .pth format is present in the **models** directory. 79 | 80 | To launch the evaluation with default settings, run: 81 | 82 | ```bash 83 | python train_bert.py --eval 84 | ``` 85 | 86 | In this case, the model file will be chosen automatically according to the file name prefix formed from the preset hyperparameters. 87 | To pick a model manually, adjust the hyperparameters (number of outcomes, covariance type, features, loss function type) to match the previously finetuned model and run: 88 | 89 | ```bash 90 | python train_bert.py --eval -m <model file name> 91 | ``` 92 | 93 | Commonly changed parameters for the evaluation are the dataset file name, validation dataloader size and model file name. 94 | 95 | To perform per-user evaluation, use the `-vu -v <N>` flags, which will pick the N users with the highest number of samples from the dataset. 96 | In this case, the performance metrics are averaged per user rather than per tweet. 97 | Note that only probabilistic models using GMMs can summarize multiple per-tweet predictions. 98 | 99 | The results of the evaluation are written to a .jsonl dataset file containing the input and output of the model. 100 | By default, performance metrics are calculated at the end and written to a short .txt report file. 101 | Visualizations of the error distance density and its cumulative distribution per outcome are drawn to .png files. 102 | 103 | Using `valid_data.py` you can read saved prediction files and use the visualization functions more easily. 104 | 105 | All outputs of the evaluation are stored in the **results** folder. 106 | 107 | ### Prediction 108 | 109 | **NOTE!** To run single text prediction with local models, place the finetuned .pth model files in the **models/final** directory.
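If you prefer to call the predictor from Python rather than through the interactive prompt below, a minimal sketch adapted from `input_entry.py` is shown here. It assumes the default probabilistic 5-outcome setup and the `k4tel/geo-bert-multilingual` HuggingFace model referenced above; treat it as a starting point, not a stable API.

```python
# Minimal programmatic use, adapted from input_entry.py (defaults: probabilistic
# model, spherical covariance, 5 weighted outcomes, NON-GEO + GEO-ONLY features).
from utils.prediction import *
from utils.regressor import *

prefix = "k4tel/geo-bert-multilingual"                    # HuggingFace repository of the best model
model_wrapper = BERTregModel(5, "spher", True, ["NON-GEO", "GEO-ONLY"], None, prefix)
prediction = ModelOutput(model_wrapper, prefix, False)    # False -> not a local .pth model

result = prediction.prediction_output("Some text to geolocate", filtering=True, visual=False)
print(result.means[0])    # predicted (lon, lat) per outcome
print(result.weights[0])  # corresponding outcome weights
```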
110 | 111 | To launch the prediction with default settings, run: 112 | 113 | ```bash 114 | python input_entry.py 115 | ``` 116 | Parameters like the number of outcomes, probabilistic or geospatial model type, local model file and text can be specified via flags: 117 | 118 | ```bash 119 | python input_entry.py -m <model> -t <text> 120 | ``` 121 | 122 | ## Support 123 | 124 | For support, email lutsai.k@gmail.com 125 | -------------------------------------------------------------------------------- /collector.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | import pandas as pd 4 | import os, glob 5 | 6 | country_codes = ["CA", "GB", "FR"] 7 | 8 | def parse_geo(obj): 9 | print(obj['coordinates']["coordinates"]) 10 | coord_long = obj['coordinates']["coordinates"][0] 11 | coord_lat = obj['coordinates']["coordinates"][1] 12 | return coord_long,coord_lat 13 | 14 | 15 | def parse_user(obj): 16 | location = obj['user']['location'] if obj["user"]["location"] else "" 17 | username = obj['user']['name'] if obj["user"]["name"] else "" 18 | screen = obj['user']['screen_name'] if obj["user"]["screen_name"] else "" 19 | description = obj['user']['description'] if obj["user"]["description"] else "" 20 | user = f"{username} {screen} {description} {location}" 21 | return user 22 | 23 | 24 | def parse_place(obj): 25 | full = obj['place']['full_name'] if obj['place']['full_name'] else "" 26 | country = obj['place']['country'] if obj['place']['country'] else "" 27 | code = obj['place']['country_code'] if obj['place']['country_code'] else "" 28 | name = obj['place']['name'] if obj['place']['name'] else "" 29 | type = obj['place']['place_type'] if obj['place']['place_type'] else "" 30 | place = f"{country} {type} {name} {full} {code}" 31 | return place, code 32 | 33 | 34 | def parse_tweet(obj): 35 | text = obj['text'] 36 | time = obj['created_at'] 37 | lang = obj['lang'] if obj['lang'] else "" 38 | return text, time, lang 39 | 40 | 41 | def parse_train(fid): 42 | known_ids = set() 43 | 44 | for line in fid: 45 | if not line: 46 | continue 47 | try: 48 | obj = json.loads(line) 49 | except (json.decoder.JSONDecodeError, TypeError): 50 | print("ERROR: entry wasn't a dictionary. 
skipping.", file=sys.stderr) 51 | continue 52 | 53 | try: 54 | if 'id_str' not in obj: 55 | print("ERROR: 'id_str' field not found in tweet", file=sys.stderr) 56 | continue 57 | if 'place' not in obj: 58 | print("ERROR: 'place' field not found in tweet", file=sys.stderr) 59 | continue 60 | if 'user' not in obj: 61 | print("ERROR: 'user' field not found in tweet", file=sys.stderr) 62 | continue 63 | if 'coordinates' not in obj: 64 | print("ERROR: 'coordinates' field not found in tweet", file=sys.stderr) 65 | continue 66 | if 'created_at' not in obj: 67 | print("ERROR: 'created_at' field not found in tweet {}".format(obj['id_str']), file=sys.stderr) 68 | continue 69 | 70 | except TypeError: 71 | print("ERROR: not a dict?", line, obj, file=sys.stderr) 72 | continue 73 | 74 | if not obj["coordinates"]: 75 | continue 76 | if not obj["coordinates"]["coordinates"]: 77 | continue 78 | if not obj["place"]: 79 | continue 80 | # if obj["place"]["country_code"] not in country_codes: 81 | # continue 82 | if not obj["user"]: 83 | continue 84 | 85 | if obj['id_str'] in known_ids: # duplicate 86 | continue 87 | 88 | text, time, lang = parse_tweet(obj) 89 | long, lat = parse_geo(obj) 90 | place, code = parse_place(obj) 91 | user = parse_user(obj) 92 | known_ids.add(obj['id_str']) 93 | 94 | yield (long, lat, text, time, lang, code, place, user) 95 | 96 | 97 | def read_train(filename): 98 | print("Reading data from:", filename) 99 | with open(filename, encoding='utf-8') as fid: 100 | lines = parse_train(fid) 101 | longs, lats, texts, times, langs, codes, places, users = zip(*lines) 102 | 103 | data_geo = { 104 | 'lon':longs, 105 | 'lat':lats, 106 | 'time':times, 107 | 'texts':texts, 108 | 'lang':langs, 109 | 'code':codes, 110 | 'place':places, 111 | 'user':users 112 | } 113 | 114 | print("Training set of ===", len(longs), "=== samples is collected") 115 | 116 | return pd.DataFrame(data_geo) 117 | 118 | 119 | def write_records(file, df): 120 | with open(file, "w") as f: 121 | df.to_json(f, orient='records', lines=True) 122 | print("Data written to file:", file) 123 | 124 | 125 | def combine(file): 126 | os.chdir(os.path.dirname(__file__) + r"/filtered_json") 127 | 128 | extension = 'txt' 129 | all_filenames = [i for i in glob.glob('*.{}'.format(extension))] 130 | 131 | combined_txt = pd.concat([pd.read_json(path_or_buf=f, lines=True) for f in all_filenames ]) 132 | 133 | #os.chdir(os.path.dirname(__file__)) 134 | 135 | with open(file, "w") as f: 136 | combined_txt.to_json(file, orient='records', lines=True) 137 | print(f"Data from {len(all_filenames)} files written to common dataset: {file}") 138 | 139 | 140 | test_input_file = '/run/user/1005618/gvfs/smb-share:server=archive3.ist.local,share=group/chlgrp/twitter-collection-2022/twitter-2022-01-25.txt' 141 | output_folder = "datasets/" 142 | 143 | 144 | def main(concat_to_one=False): 145 | try: # 1 arg - data input (txt) 146 | filename = sys.argv[1] 147 | except IndexError: 148 | filename = test_input_file 149 | print(f"Input file: {filename}") 150 | 151 | try: # 2 arg - filtered data output (jsonl) 152 | output_filename = sys.argv[2] 153 | except IndexError: 154 | head, tail = os.path.split(filename) 155 | output_filename = output_folder + tail 156 | open(output_filename, 'w').close() # if not exists 157 | print(f"Output file: {output_filename}") 158 | 159 | # manual testing 160 | # df_geo = read_train(filename) 161 | # write_records(output_filename, df_geo) 162 | 163 | try: 164 | df_geo = read_train(filename) 165 | write_records(output_filename, df_geo) 166 | except 
Exception as e: 167 | print("Couldn't form dataset:", e) 168 | 169 | if concat_to_one: # if all .txt per-day files are parsed - combine to single json 170 | combine("ca-twitter-2022.jsonl") 171 | 172 | 173 | if __name__ == "__main__": 174 | main() 175 | -------------------------------------------------------------------------------- /input_entry.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from utils.prediction import * 3 | from utils.regressor import * 4 | 5 | # Entry point for prediction from text (model .pth files needed) 6 | 7 | local_ww_models = { 8 | "gsop": "G-NON-GEO+GEO-ONLY-O1", 9 | "gmop": "G-NON-GEO+GEO-ONLY-O5", 10 | "psop": "P-NON-GEO+GEO-ONLY-O1", 11 | "pmop": "P-NON-GEO+GEO-ONLY-O5" 12 | } 13 | 14 | outcomes = 5 # 1 or 5 15 | prob = True # True or False 16 | 17 | features = ["NON-GEO", "GEO-ONLY"] 18 | 19 | text_example = "CIA and FBI can track anyone, and you willingly give the data away" 20 | 21 | local = False 22 | hub_model_prefix = "k4tel/geo-bert-multilingual" 23 | 24 | 25 | def main(): 26 | parser = argparse.ArgumentParser(description='Prediction of geolocations') 27 | parser.add_argument('-o', '--outcomes', type=int, default=outcomes, help="Number of outcomes (lon, lat) per tweet (default: 5)") 28 | parser.add_argument('-s', '--spat', action="store_true", help="Use geospatial model (default: probabilistic)") 29 | parser.add_argument('-l', '--local', action="store_true", help="Use model stored locally") 30 | parser.add_argument('-m', '--model', type=str, default=None, help='Filename prefix of local model OR HuggingFace repository link') 31 | parser.add_argument('-t', '--text', type=str, default=None, help='Text to process (max: 300 words)') 32 | args = parser.parse_args() 33 | 34 | weighted = args.outcomes > 1 35 | covariance = None if args.spat else "spher" 36 | 37 | if args.model: # models/final/.pth file; NOTE correct setup is needed 38 | prefix = args.model 39 | elif args.model is None and args.local: # picking local model according to the setup 40 | if args.outcomes > 1: 41 | local_model_prefix = local_ww_models["gmop"] if args.spat else local_ww_models["pmop"] 42 | else: 43 | local_model_prefix = local_ww_models["gsop"] if args.spat else local_ww_models["psop"] 44 | 45 | prefix = local_model_prefix 46 | else: # setup for P-NON-GEO+GEO-ONLY-O5 47 | weighted = True 48 | covariance = "spher" 49 | args.outcomes = 5 50 | args.spat = False 51 | prefix = hub_model_prefix 52 | 53 | # if not local - loading automatically on BERTregModel init 54 | model_wrapper = BERTregModel(args.outcomes, covariance, weighted, features, None, prefix) \ 55 | if not args.local else BERTregModel(args.outcomes, covariance, weighted, features) 56 | 57 | # if local - loading automatically on ModelOutput init 58 | prediction = ModelOutput(model_wrapper, prefix, args.local) 59 | 60 | print(f"MODEL\tBERT geo regression model is ready, you can now predict location from the text (300 words max) " 61 | f"in the form of {'Gaussian distributions (lon, lat, cov)' if not args.spat else 'coordinates (lon, lat)'}" 62 | f" with {args.outcomes} possible prediction outcomes.\nNOTE\tOutcomes that have very low weight won't be displayed") 63 | 64 | text = args.text if args.text else input("Insert text: ") 65 | while text != "exit": 66 | if len(text) == 0: 67 | text = text_example 68 | if len(text.split()) < 300: 69 | result = prediction.prediction_output(text, filtering=True, visual=False) 70 | 71 | if args.outcomes > 1: 72 | ind = 
np.argwhere(np.round(result.weights[0, :] * 100, 2) > 0) 73 | significant = result.means[0, ind].reshape(-1, 2) 74 | weights = result.weights[0, ind].flatten() 75 | else: 76 | significant = result.means.reshape(-1, 2) 77 | weights = np.ones(1) 78 | 79 | sig_weights = np.round(weights * 100, 2) 80 | sig_weights = sig_weights[sig_weights > 0] 81 | 82 | print(f"RESULT\t{len(sig_weights)} significant prediction outcome(s):") 83 | 84 | for i in range(len(sig_weights)): 85 | point = f"lon: {' lat: '.join(map(str, significant[i]))}" 86 | print(f"\tOut {i + 1}\t{sig_weights[i]}%\t-\t{point}") 87 | 88 | else: 89 | print(f"Number of words is above 300, unable to process.") 90 | 91 | text = args.text if args.text else input("Insert text: ") 92 | 93 | 94 | if __name__ == "__main__": 95 | main() 96 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | geopandas==0.12.2 2 | geopy==2.3.0 3 | GPUtil==1.4.0 4 | imageio==2.25.1 5 | matplotlib==3.7.0 6 | moviepy==1.0.3 7 | numpy==1.21.5 8 | pandas==1.5.2 9 | plotly==5.13.1 10 | psutil==5.9.4 11 | pyarrow==11.0.0 12 | scikit_learn==1.2.1 13 | scipy==1.10.1 14 | seaborn==0.12.2 15 | Shapely==2.0.1 16 | spacy==3.5.0 17 | torch==1.13.1 18 | torchvision==0.14.1 19 | torchaudio==0.13.1 20 | torchtext==0.14.1 21 | fastai==2.7.11 22 | tokenizers 23 | torchdata==0.5.1 24 | tqdm==4.64.1 25 | transformers==4.26.0 26 | Basemap==1.3.6 27 | basemap-data-hires==1.3.2 28 | -------------------------------------------------------------------------------- /results/img/cum_dist_U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-22.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/cum_dist_U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-22.png -------------------------------------------------------------------------------- /results/img/cum_dist_U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/cum_dist_U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-02.png -------------------------------------------------------------------------------- /results/img/cum_dist_U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-25.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/cum_dist_U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-25.png -------------------------------------------------------------------------------- /results/img/cum_dist_U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-01.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/cum_dist_U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-01.png -------------------------------------------------------------------------------- /results/img/cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-25.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-25.png -------------------------------------------------------------------------------- /results/img/cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-10-31.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-10-31.png -------------------------------------------------------------------------------- /results/img/cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_sum-mf_sum-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-29.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_sum-mf_sum-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-29.png -------------------------------------------------------------------------------- /results/img/cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-10-31.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-10-31.png -------------------------------------------------------------------------------- /results/img/cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-01.png -------------------------------------------------------------------------------- /results/img/cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-23.png -------------------------------------------------------------------------------- 
/results/img/cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_sum-NP-weighted-N30e5-B12-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-29.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_sum-NP-weighted-N30e5-B12-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-29.png -------------------------------------------------------------------------------- /results/img/density_U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-22.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/density_U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-22.png -------------------------------------------------------------------------------- /results/img/density_U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/density_U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-02.png -------------------------------------------------------------------------------- /results/img/density_U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-25.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/density_U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-25.png -------------------------------------------------------------------------------- /results/img/density_U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/density_U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-01.png -------------------------------------------------------------------------------- /results/img/density_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-25.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/density_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-25.png -------------------------------------------------------------------------------- /results/img/density_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-10-31.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/density_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-10-31.png -------------------------------------------------------------------------------- /results/img/density_U-NON-GEO+GEO-ONLY-O5-d-total_sum-mf_sum-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-29.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/density_U-NON-GEO+GEO-ONLY-O5-d-total_sum-mf_sum-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-29.png -------------------------------------------------------------------------------- /results/img/density_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-10-31.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/density_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-10-31.png -------------------------------------------------------------------------------- /results/img/density_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/density_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-01.png -------------------------------------------------------------------------------- /results/img/density_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/density_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-23.png -------------------------------------------------------------------------------- /results/img/density_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_sum-NP-weighted-N30e5-B12-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-29.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/density_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_sum-NP-weighted-N30e5-B12-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-29.png -------------------------------------------------------------------------------- /results/img/gmm_contour_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/gmm_contour_plot.png -------------------------------------------------------------------------------- /results/img/gmm_likelihood_world.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/gmm_likelihood_world.png 
-------------------------------------------------------------------------------- /results/img/gmm_user_summary_S300_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N6615_VF-NON-GEO_2022-11-17_N6615_2022-11-17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/gmm_user_summary_S300_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N6615_VF-NON-GEO_2022-11-17_N6615_2022-11-17.png -------------------------------------------------------------------------------- /results/img/gmm_user_summary_S38_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N218_VF-NON-GEO_2022-11-24_N218_2022-11-24.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/gmm_user_summary_S38_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N218_VF-NON-GEO_2022-11-24_N218_2022-11-24.png -------------------------------------------------------------------------------- /results/img/gmm_user_summary_S500_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N6615_VF-NON-GEO_2022-11-17_N6615_2022-11-17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/gmm_user_summary_S500_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N6615_VF-NON-GEO_2022-11-17_N6615_2022-11-17.png -------------------------------------------------------------------------------- /results/img/text_map_U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_2022-11-23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/text_map_U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_2022-11-23.png -------------------------------------------------------------------------------- /results/img/text_map_U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_2022-11-23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/text_map_U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_2022-11-23.png -------------------------------------------------------------------------------- /results/img/text_map_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_2022-11-23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/text_map_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_2022-11-23.png -------------------------------------------------------------------------------- 
/results/img/text_map_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_2022-11-23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/text_map_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_2022-11-23.png -------------------------------------------------------------------------------- /results/metric/EF-U-NON-GEO+GEO-ONLY-O3-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2023-01-25.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 57.1224663336316 3 | Median SAE 3.630642382829759 4 | MSE 21.350589606782105 5 | MAE 0.8632753163412875 6 | Acc@100 94.73133333333334 7 | Acc@161 96.005 8 | Average CAE 97.14063901745017 9 | Median CAE 47.03875416171589 10 | Average 95% PRA 5.7877121133727405 11 | Median 95% PRA 2.9956897498650576 12 | PRA COVerage 0.7470066666666667 13 | Outcome ALL 3 14 | Average SAE 57.19130247060087 15 | Median SAE 3.6816462055878008 16 | MSE 21.341014506627996 17 | MAE 0.4321043145344873 18 | Acc@100 94.72033333333333 19 | Acc@161 96.00333333333333 20 | Average CAE 99.84322494836586 21 | Median CAE 47.643151346524434 22 | Average 95% PRA 8.826576905114056 23 | Median 95% PRA 7.581258966300876 24 | PRA COVerage 0.58095 25 | -------------------------------------------------------------------------------- /results/metric/EIS-US-U-NON-GEO+GEO-ONLY-O3-d-total_mean-mf_mean-pos_spher-weighted-N3e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N76000_VF-NON-GEO_2023-02-09.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 1094.6138078166332 3 | Median SAE 769.7310232012596 4 | MSE 141.878554113195 5 | MAE 14.264336468087135 6 | Acc@100 0.4144736842105263 7 | Acc@161 0.8763157894736843 8 | Average CAE 1659.6462067284033 9 | Median CAE 1378.7820427028882 10 | Average 95% PRA 1676.6693654767037 11 | Median 95% PRA 1658.3572044843243 12 | PRA COVerage 0.028342105263157894 13 | Outcome ALL 3 14 | Average SAE 1094.6138086046483 15 | Median SAE 769.7310234018382 16 | MSE 141.87855446328854 17 | MAE 7.132168239106902 18 | Acc@100 0.4144736842105263 19 | Acc@161 0.8763157894736843 20 | Average CAE 1658.9157483435374 21 | Median CAE 1377.346818732903 22 | Average 95% PRA 177.55978655591733 23 | Median 95% PRA 176.67033683902014 24 | PRA COVerage 0.09657017543859649 25 | -------------------------------------------------------------------------------- /results/metric/EIS-US-U-NON-GEO+GEO-ONLY-O3-d-total_mean-mf_mean-pos_spher-weighted-N3e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N76000_VF-TEXT-ONLY_2023-02-09.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 1215.7329909213397 3 | Median SAE 787.4934604866901 4 | MSE 199.027752393994 5 | MAE 15.800832527802372 6 | Acc@100 0.23947368421052634 7 | Acc@161 0.5342105263157895 8 | Average CAE 1773.335490256125 9 | Median CAE 1390.2649093982022 10 | Average 95% PRA 1658.4397793658538 11 | Median 95% PRA 1679.9652339034906 12 | PRA COVerage 0.01938157894736842 13 | Outcome ALL 3 14 | Average SAE 1216.029780347764 15 | Median SAE 787.4934607697677 16 | MSE 199.3000028756908 17 | MAE 7.9021518988569746 18 | Acc@100 0.23947368421052634 19 | Acc@161 0.5342105263157895 20 | Average CAE 
1773.0230774157867 21 | Median CAE 1389.7878439109427 22 | Average 95% PRA 175.83709034876665 23 | Median 95% PRA 177.81759959293902 24 | PRA COVerage 0.09093421052631578 25 | -------------------------------------------------------------------------------- /results/metric/U-GEO-O5-d-total_mean-pos_spher-weighted-N30e5-B12-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2022-11-28.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 1266.1940182232079 3 | Median SAE 37.688264739505954 4 | MSE 743.222800867309 5 | MAE 17.220915844783917 6 | Acc@100 62.153000000000006 7 | Acc@161 67.02466666666666 8 | Average CAE 1292.5292902055553 9 | Median CAE 62.64815599155605 10 | Average 95% PRA 27.22535086762321 11 | Median 95% PRA 2.9983193329878755 12 | PRA COVerage 0.21840666666666667 13 | Outcome ALL 5 14 | Average SAE 1266.1951182548135 15 | Median SAE 37.68981554393958 16 | MSE 743.2222789348478 17 | MAE 8.610462794339353 18 | Acc@100 62.153666666666666 19 | Acc@161 67.02466666666666 20 | Average CAE 1292.554557406498 21 | Median CAE 62.64599706100019 22 | Average 95% PRA 14.736914482446378 23 | Median 95% PRA 7.515493132740927 24 | PRA COVerage 0.117392 25 | -------------------------------------------------------------------------------- /results/metric/U-GEO-O5-d-total_mean-pos_spher-weighted-N30e5-B12-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2022-11-29.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 3203.720367762154 3 | Median SAE 585.8696742244236 4 | MSE 2027.3966353043922 5 | MAE 44.29867222675726 6 | Acc@100 38.46633333333334 7 | Acc@161 41.17966666666666 8 | Average CAE 3225.9160141364423 9 | Median CAE 623.3095886278236 10 | Average 95% PRA 86.12589795786408 11 | Median 95% PRA 3.8511226439992745 12 | PRA COVerage 0.16536 13 | Outcome ALL 5 14 | Average SAE 3203.720654187707 15 | Median SAE 585.8698625325562 16 | MSE 2027.3946785780554 17 | MAE 22.149332638486232 18 | Acc@100 38.46633333333334 19 | Acc@161 41.17966666666666 20 | Average CAE 3225.8762513200772 21 | Median CAE 623.4343025445885 22 | Average 95% PRA 29.854989140654062 23 | Median 95% PRA 8.514526081273205 24 | PRA COVerage 0.07474533333333333 25 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-NON-GEO_2022-11-02_metric_N300000_VF-NON-GEO_2022-11-22.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 568.2775034320997 3 | Median SAE 32.0780867576 4 | MSE 334.60555609377246 5 | MAE 7.819104565829121 6 | Acc@100 72.53866666666666 7 | Acc@161 78.28633333333333 8 | Average CAE 639.2131073260726 9 | Median CAE 70.40403356928135 10 | Average 95% PRA 60.94092486527804 11 | Median 95% PRA 3.705517191563791 12 | PRA COVerage 0.19613666666666665 13 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-NON-GEO_2023-02-19_metric_N300000_VF-NON-GEO_2023-02-21.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 526.2267981509376 3 | Median SAE 37.7204090414 4 | MSE 259.39274008786316 5 | MAE 7.027323746037905 6 | Acc@100 69.76733333333334 7 | 
Acc@161 76.324 8 | Average CAE 598.4284945825225 9 | Median CAE 78.52907853415618 10 | Average 95% PRA 62.67164017585331 11 | Median 95% PRA 4.03528769698187 12 | PRA COVerage 0.17678666666666668 13 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-TEXT-ONLY_2022-11-02_metric_N300000_VF-TEXT-ONLY_2022-11-22.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 1881.226679489545 3 | Median SAE 152.99867793205 4 | MSE 1118.1127814060806 5 | MAE 25.450553855346023 6 | Acc@100 46.009 7 | Acc@161 50.49133333333333 8 | Average CAE 1953.9709490024165 9 | Median CAE 352.7690710078982 10 | Average 95% PRA 174.00230971889167 11 | Median 95% PRA 60.76411961255744 12 | PRA COVerage 0.12542333333333333 13 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-NON-GEO_2022-11-24_metric_N300000_VF-NON-GEO_2022-11-24.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 559.6151196493181 3 | Median SAE 36.613367614249995 4 | MSE 319.76210796374187 5 | MAE 7.719847568522422 6 | Acc@100 72.44500000000001 7 | Acc@161 78.39566666666667 8 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-TEXT-ONLY_2022-11-01_metric_N300000_VF-TEXT-ONLY_2022-11-22.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 1872.1827726693336 3 | Median SAE 140.2990609901 4 | MSE 1150.9366438239665 5 | MAE 25.576461171268132 6 | Acc@100 46.70166666666667 7 | Acc@161 51.282666666666664 8 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O10-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2022-11-30.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 552.887252254477 3 | Median SAE 32.92428538962793 4 | MSE 314.9492124902332 5 | MAE 7.626378681627015 6 | Acc@100 73.48566666666667 7 | Acc@161 78.86800000000001 8 | Average CAE 588.9925675399514 9 | Median CAE 64.98670458833743 10 | Average 95% PRA 36.763179226711436 11 | Median 95% PRA 3.3166217318692706 12 | PRA COVerage 0.08798 13 | Outcome ALL 10 14 | Average SAE 553.176748588044 15 | Median SAE 33.23239487280529 16 | MSE 315.00555681992574 17 | MAE 3.814997443082041 18 | Acc@100 73.442 19 | Acc@161 78.847 20 | Average CAE 599.809585161102 21 | Median CAE 77.92234001691176 22 | Average 95% PRA 15.566235104908506 23 | Median 95% PRA 9.425969628868724 24 | PRA COVerage 0.110402 25 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O10-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2022-11-30.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 1844.6686523197552 3 | Median SAE 142.68284884641122 4 | MSE 1127.9185978737346 5 | MAE 25.23800925753932 6 | Acc@100 46.693 7 | 
Acc@161 51.11 8 | Average CAE 1896.672505641246 9 | Median CAE 191.6308764982617 10 | Average 95% PRA 173.8400432096454 11 | Median 95% PRA 5.282659731027015 12 | PRA COVerage 0.06211 13 | Outcome ALL 10 14 | Average SAE 1844.9623918688806 15 | Median SAE 143.21406110901034 16 | MSE 1127.9052539828947 17 | MAE 12.620134586598647 18 | Acc@100 46.653666666666666 19 | Acc@161 51.07633333333334 20 | Average CAE 1901.3933567712004 21 | Median CAE 214.22602320099966 22 | Average 95% PRA 32.41118289375993 23 | Median 95% PRA 13.948084191235818 24 | PRA COVerage 0.07766333333333333 25 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O100-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2022-12-08.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 568.6000967458361 3 | Median SAE 27.83860357461102 4 | MSE 315.35875458483565 5 | MAE 7.766317411950378 6 | Acc@100 73.52233333333334 7 | Acc@161 78.60233333333333 8 | Average CAE 608.2586993647415 9 | Median CAE 61.20042899812903 10 | Average 95% PRA 36.75226019373327 11 | Median 95% PRA 3.1656213706865604 12 | PRA COVerage 0.20137333333333332 13 | Outcome ALL 100 14 | Average SAE 568.9086226979437 15 | Median SAE 28.136057359410167 16 | MSE 315.3168704794486 17 | MAE 3.884708725501805 18 | Acc@100 73.476 19 | Acc@161 78.58766666666666 20 | Average CAE 618.2874970100345 21 | Median CAE 71.06793525191361 22 | Average 95% PRA 15.226239472174859 23 | Median 95% PRA 9.142879964102544 24 | PRA COVerage 0.011941366666666666 25 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O100-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-TEXT-ONLY_2022-11-29_metric_N300000_VF-TEXT-ONLY_2023-02-18.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 1881.1270131620297 3 | Median SAE 149.12695451305 4 | MSE 1135.7307776956497 5 | MAE 25.583227937114227 6 | Acc@100 46.605999999999995 7 | Acc@161 50.644 8 | Average CAE 1939.090633727154 9 | Median CAE 199.66859359431743 10 | Average 95% PRA 193.15558184238012 11 | Median 95% PRA 4.445754656123253 12 | PRA COVerage 0.13863666666666666 13 | Outcome ALL 100 14 | Average SAE 1882.4342208517496 15 | Median SAE 149.79454355771463 16 | MSE 1137.260548701505 17 | MAE 12.797386544848854 18 | Acc@100 46.55233333333334 19 | Acc@161 50.61766666666667 20 | Average CAE 1942.8327829215752 21 | Median CAE 224.19472848188917 22 | Average 95% PRA 31.19339623431975 23 | Median 95% PRA 12.272656917861283 24 | PRA COVerage 0.0085576 25 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O3-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2023-02-18.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 561.5762779166488 3 | Median SAE 29.453758479280637 4 | MSE 311.4603138009121 5 | MAE 7.639022741766334 6 | Acc@100 73.50066666666667 7 | Acc@161 78.673 8 | Average CAE 597.5751942328264 9 | Median CAE 60.94959868185593 10 | Average 95% PRA 16.204564645261435 11 | Median 95% PRA 3.096197964297197 12 | PRA COVerage 0.12032666666666667 13 | Outcome ALL 3 14 | Average SAE 561.6625539878369 15 | Median 
SAE 29.544135384316714 16 | MSE 311.4622059291896 17 | MAE 3.820028644168509 18 | Acc@100 73.48633333333333 19 | Acc@161 78.669 20 | Average CAE 601.0722925067477 21 | Median CAE 63.27102175171326 22 | Average 95% PRA 13.92078117550932 23 | Median 95% PRA 7.925957260886589 24 | PRA COVerage 0.31235 25 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O3-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2023-02-18.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 1876.1481224976758 3 | Median SAE 134.89082598225573 4 | MSE 1145.2134790067066 5 | MAE 25.60224848819392 6 | Acc@100 47.33266666666667 7 | Acc@161 51.528666666666666 8 | Average CAE 1908.385586945694 9 | Median CAE 209.7002456012217 10 | Average 95% PRA 45.47747936340816 11 | Median 95% PRA 11.299158256227699 12 | PRA COVerage 0.0764 13 | Outcome ALL 3 14 | Average SAE 1876.2305436206989 15 | Median SAE 134.93077903759564 16 | MSE 1145.2199094124885 17 | MAE 12.801477918607013 18 | Acc@100 47.315000000000005 19 | Acc@161 51.52366666666667 20 | Average CAE 1911.8091583122218 21 | Median CAE 219.9153649029191 22 | Average 95% PRA 24.103429804960783 23 | Median 95% PRA 16.185895312036987 24 | PRA COVerage 0.22035 25 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O3-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2023-02-18.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 556.2925544063847 3 | Median SAE 36.54243725535712 4 | MSE 314.97636029200527 5 | MAE 7.659210079387742 6 | Acc@100 72.64399999999999 7 | Acc@161 78.51700000000001 8 | Outcome ALL 3 9 | Average SAE 556.2938914782208 10 | Median SAE 36.54188096160445 11 | MSE 314.9766550857815 12 | MAE 3.8296141109586066 13 | Acc@100 72.64399999999999 14 | Acc@161 78.51700000000001 15 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O3-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2023-02-19.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 1859.6971406684952 3 | Median SAE 142.77144003340806 4 | MSE 1141.0929485863185 5 | MAE 25.480128561718153 6 | Acc@100 46.49333333333333 7 | Acc@161 51.11866666666667 8 | Outcome ALL 3 9 | Average SAE 1859.696318863178 10 | Median SAE 142.77144269260805 11 | MSE 1141.0931166967357 12 | MAE 12.740060355875377 13 | Acc@100 46.49333333333333 14 | Acc@161 51.117999999999995 15 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-all_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2022-12-05.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 626.5331734393261 3 | Median SAE 69.84155282542723 4 | MSE 317.31179424625583 5 | MAE 8.472252410206108 6 | Acc@100 59.728 7 | Acc@161 70.66799999999999 8 | Average CAE 689.7891627867381 9 | Median CAE 156.3155615968835 10 | Average 95% PRA 37.68922225874048 11 | Median 95% PRA 17.541909748063617 12 | PRA COVerage 0.07858 13 | Outcome ALL 5 14 | Average SAE 626.6558752911719 
15 | Median SAE 70.04934056289767 16 | MSE 317.2918178901608 17 | MAE 4.236919440100403 18 | Acc@100 59.694 19 | Acc@161 70.65766666666666 20 | Average CAE 691.0206138642998 21 | Median CAE 158.53082102124682 22 | Average 95% PRA 21.285102514536906 23 | Median 95% PRA 18.52029142785464 24 | PRA COVerage 0.09724933333333333 25 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-all_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2022-12-05.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 2036.690820895306 3 | Median SAE 234.72389085167492 4 | MSE 1191.569184271369 5 | MAE 27.367099908307914 6 | Acc@100 38.20033333333333 7 | Acc@161 45.15166666666667 8 | Average CAE 2080.0969230715614 9 | Median CAE 356.8301489056366 10 | Average 95% PRA 56.656404871128125 11 | Median 95% PRA 43.787506290806675 12 | PRA COVerage 0.04188666666666667 13 | Outcome ALL 5 14 | Average SAE 2036.7343356781525 15 | Median SAE 234.8195810981424 16 | MSE 1191.543656971664 17 | MAE 13.68385871031472 18 | Acc@100 38.18266666666666 19 | Acc@161 45.14666666666667 20 | Average CAE 2080.868258573548 21 | Median CAE 357.46473741475245 22 | Average 95% PRA 27.878607591261453 23 | Median 95% PRA 28.77262525130461 24 | PRA COVerage 0.06187533333333333 25 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N100_VF-NON-GEO_2022-11-17.txt: -------------------------------------------------------------------------------- 1 | Average SAE 119.66019516154287 2 | Median SAE 17.810982552272733 3 | MSE 16.20156415636118 4 | MAE 0.7875519875345544 5 | Acc@100 92.0 6 | Acc@161 93.0 7 | Average CAE 166.41984894274432 8 | Median CAE 62.34761531265032 9 | Average 95% PRA 10.79242328166499 10 | Median 95% PRA 8.531194412932471 11 | PRA COVerage 0.3 12 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2023-02-20.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 551.0441079868583 3 | Median SAE 29.009580915784433 4 | MSE 302.38283684822113 5 | MAE 7.530359371708234 6 | Acc@100 73.82233333333333 7 | Acc@161 79.01599999999999 8 | Average CAE 587.5484498388176 9 | Median CAE 61.411216369677334 10 | Average 95% PRA 25.972122482501153 11 | Median 95% PRA 3.07923791039419 12 | PRA COVerage 0.12674333333333335 13 | Outcome ALL 5 14 | Average SAE 551.3804511944757 15 | Median SAE 29.40786386084497 16 | MSE 302.45818164043794 17 | MAE 3.7675011234359808 18 | Acc@100 73.77966666666667 19 | Acc@161 78.98366666666666 20 | Average CAE 600.3474117451983 21 | Median CAE 79.1162587443271 22 | Average 95% PRA 15.252935002360253 23 | Median 95% PRA 9.642332670934938 24 | PRA COVerage 0.23365133333333332 25 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2023-02-24.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 880.2045442181371 3 | Median SAE 
24.089552026815696 4 | MSE 1043.625869290649 5 | MAE 14.203196909385907 6 | Acc@100 76.47033333333333 7 | Acc@161 80.278 8 | Average CAE 915.5020383759471 9 | Median CAE 58.556179799350836 10 | Average 95% PRA 22.806675555306633 11 | Median 95% PRA 3.051213891111107 12 | PRA COVerage 0.15504 13 | Outcome ALL 5 14 | Average SAE 880.4064974262934 15 | Median SAE 24.553067239616468 16 | MSE 1043.5267443249566 17 | MAE 7.102857868789059 18 | Acc@100 76.43100000000001 19 | Acc@161 80.26666666666667 20 | Average CAE 926.048857600671 21 | Median CAE 72.42235829900109 22 | Average 95% PRA 14.245569049022276 23 | Median 95% PRA 9.008763377993043 24 | PRA COVerage 0.23922133333333334 25 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2023-02-24.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 1587.7103717657617 3 | Median SAE 49.43012531452397 4 | MSE 1071.5542114028951 5 | MAE 21.942927961935947 6 | Acc@100 57.03366666666667 7 | Acc@161 60.70166666666667 8 | Average CAE 1627.8312262807785 9 | Median CAE 77.75976669138876 10 | Average 95% PRA 73.95988109756931 11 | Median 95% PRA 3.238094487707059 12 | PRA COVerage 0.12459666666666666 13 | Outcome ALL 5 14 | Average SAE 1587.5032657173201 15 | Median SAE 49.839323052728304 16 | MSE 1071.209588332399 17 | MAE 10.97015378954644 18 | Acc@100 56.985 19 | Acc@161 60.668666666666674 20 | Average CAE 1635.872835341197 21 | Median CAE 105.6462485079305 22 | Average 95% PRA 23.616667714973637 23 | Median 95% PRA 11.601910217762153 24 | PRA COVerage 0.18414266666666668 25 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N5000_UVF-NON-GEO_2023-02-24.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 567.3842990687558 3 | Median SAE 26.210280141113937 4 | MSE 514.1601806344198 5 | MAE 8.524774971330553 6 | Acc@100 77.74 7 | Acc@161 82.12 8 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N5000_UVF-TEXT-ONLY_2023-02-24.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 892.2345579521816 3 | Median SAE 31.171950625491604 4 | MSE 606.8715538241867 5 | MAE 12.631638517530538 6 | Acc@100 69.44 7 | Acc@161 74.1 8 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N100_VF-NON-GEO_2022-11-17_metric_N100_VF-NON-GEO_2022-11-22.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 118.96837800851797 3 | Median SAE 17.5024042712 4 | MSE 16.143209358759403 5 | MAE 1.5657629981299996 6 | Acc@100 93.0 7 | Acc@161 93.0 8 | Average CAE 151.89993443774924 9 | Median CAE 53.5580648326241 10 | Average 95% PRA 5.035829398679516 11 | Median 95% PRA 3.018119173570053 12 | PRA COVerage 0.19 13 | Outcome ALL 5 14 | Average SAE 119.66019497129344 15 | Median SAE 17.810982220624744 16 | MSE 
16.20156407447996 17 | MAE 0.7875519863563727 18 | Acc@100 92.0 19 | Acc@161 93.0 20 | Average CAE 166.96709316134712 21 | Median CAE 62.70323179974559 22 | Average 95% PRA 10.792423280458388 23 | Median 95% PRA 8.531194419025953 24 | PRA COVerage 0.3 25 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-NON-GEO_2022-11-02_metric_N300000_VF-NON-GEO_2022-11-23.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 551.0441230576836 3 | Median SAE 29.00965624275 4 | MSE 302.38285051052117 5 | MAE 7.530359547384761 6 | Acc@100 73.82233333333333 7 | Acc@161 79.01599999999999 8 | Average CAE 587.602272535663 9 | Median CAE 61.428721501169356 10 | Average 95% PRA 25.972120676419227 11 | Median 95% PRA 3.0792380510887263 12 | PRA COVerage 0.12673666666666666 13 | Outcome ALL 5 14 | Average SAE 551.3804664537547 15 | Median SAE 29.40809293191692 16 | MSE 302.45819563128003 17 | MAE 3.7675012125909304 18 | Acc@100 73.77966666666667 19 | Acc@161 78.98366666666666 20 | Average CAE 600.4000066790437 21 | Median CAE 79.12131175647453 22 | Average 95% PRA 15.252933884700132 23 | Median 95% PRA 9.642331701034896 24 | PRA COVerage 0.233652 25 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-TEXT-ONLY_2022-11-02_metric_N300000_VF-TEXT-ONLY_2022-11-23.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 1845.00937803803 3 | Median SAE 135.06693287939999 4 | MSE 1102.7013834707896 5 | MAE 25.130295035377046 6 | Acc@100 47.35433333333333 7 | Acc@161 51.53033333333333 8 | Average CAE 1887.652430082346 9 | Median CAE 187.72449180133725 10 | Average 95% PRA 97.19770537206963 11 | Median 95% PRA 3.6437647577106986 12 | PRA COVerage 0.07906 13 | Outcome ALL 5 14 | Average SAE 1845.0441404326284 15 | Median SAE 135.87013273111606 16 | MSE 1102.7810905095876 17 | MAE 12.565707118988758 18 | Acc@100 47.30466666666667 19 | Acc@161 51.49433333333333 20 | Average CAE 1896.2714259390013 21 | Median CAE 214.6930851721048 22 | Average 95% PRA 27.91029143301712 23 | Median 95% PRA 15.158291563378132 24 | PRA COVerage 0.158866 25 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2022-11-23.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 577.751999736082 3 | Median SAE 35.62144710244375 4 | MSE 341.1656133959407 5 | MAE 7.96456962776988 6 | Acc@100 72.74166666666667 7 | Acc@161 78.59766666666667 8 | Outcome ALL 5 9 | Average SAE 577.8066991457856 10 | Median SAE 35.62152013386192 11 | MSE 341.24395821142076 12 | MAE 3.9826415725575184 13 | Acc@100 72.74166666666667 14 | Acc@161 78.59766666666667 15 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2022-11-24.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average 
SAE 1986.703707385929 3 | Median SAE 151.27159786246557 4 | MSE 1209.6200475465141 5 | MAE 26.87739020900269 6 | Acc@100 46.025666666666666 7 | Acc@161 50.599666666666664 8 | Outcome ALL 5 9 | Average SAE 1986.5804809058 10 | Median SAE 151.271602511181 11 | MSE 1209.4877847369746 12 | MAE 13.43808280982075 13 | Acc@100 46.025666666666666 14 | Acc@161 50.599666666666664 15 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-NON-GEO_2022-10-31_metric_N300000_VF-NON-GEO_2022-11-22.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 7812.566795185766 3 | Median SAE 7809.646722224799 4 | MSE 3769.8216207682385 5 | MAE 94.56699774904601 6 | Acc@100 0.11199999999999999 7 | Acc@161 0.48766666666666664 8 | Outcome ALL 5 9 | Average SAE 7812.559883244672 10 | Median SAE 7809.646722224799 11 | MSE 3769.814875589674 12 | MAE 47.283439786580686 13 | Acc@100 0.11199999999999999 14 | Acc@161 0.48766666666666664 15 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-TEXT-ONLY_2022-11-01_metric_N300000_VF-TEXT-ONLY_2022-11-22.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 7976.811124753898 3 | Median SAE 7782.067641114199 4 | MSE 3896.4071088331716 5 | MAE 96.63334555148514 6 | Acc@100 0.043 7 | Acc@161 0.198 8 | Outcome ALL 5 9 | Average SAE 7976.642842875767 10 | Median SAE 7781.7176385929 11 | MSE 3896.2099554995943 12 | MAE 48.31548389645203 13 | Acc@100 0.043 14 | Acc@161 0.198 15 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O50-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2022-12-09.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 556.596930407323 3 | Median SAE 29.012107235794964 4 | MSE 310.7896756651852 5 | MAE 7.625280180899602 6 | Acc@100 73.92433333333334 7 | Acc@161 78.97833333333332 8 | Average CAE 592.3338335828001 9 | Median CAE 60.74781081627437 10 | Average 95% PRA 23.17149074608845 11 | Median 95% PRA 3.0816176045918335 12 | PRA COVerage 0.12573333333333334 13 | Outcome ALL 50 14 | Average SAE 556.8729892447342 15 | Median SAE 29.349274534387682 16 | MSE 310.8038665670043 17 | MAE 3.814452796575479 18 | Acc@100 73.88366666666667 19 | Acc@161 78.94566666666667 20 | Average CAE 604.414241057673 21 | Median CAE 77.04904126035898 22 | Average 95% PRA 14.734713213583234 23 | Median 95% PRA 9.532355912466208 24 | PRA COVerage 0.020491866666666667 25 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O50-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2022-12-09.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 1855.923764156897 3 | Median SAE 136.03652401294164 4 | MSE 1117.6151224207629 5 | MAE 25.326143908743465 6 | Acc@100 47.25366666666667 7 | Acc@161 51.409000000000006 8 | Average CAE 1898.1937180477119 9 | Median CAE 189.36825804431618 10 | Average 95% 
PRA 99.38303170048808 11 | Median 95% PRA 3.478817313221037 12 | PRA COVerage 0.08412 13 | Outcome ALL 50 14 | Average SAE 1855.9443449215396 15 | Median SAE 136.6271873705624 16 | MSE 1117.4904022529818 17 | MAE 12.66304185335675 18 | Acc@100 47.215 19 | Acc@161 51.39833333333333 20 | Average CAE 1904.9692293352764 21 | Median CAE 214.7305412802197 22 | Average 95% PRA 27.77954768432691 23 | Median 95% PRA 14.017333622246433 24 | PRA COVerage 0.012973666666666666 25 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO-O5-d-total_mean-pos_spher-weighted-N30e5-B12-E2-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2022-11-28.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 580.6090313522942 3 | Median SAE 46.41566130380352 4 | MSE 312.193184142252 5 | MAE 7.915783033433179 6 | Acc@100 69.607 7 | Acc@161 76.45566666666667 8 | Average CAE 613.9193562319141 9 | Median CAE 81.65887706130992 10 | Average 95% PRA 21.852941664343778 11 | Median 95% PRA 3.7498956132762036 12 | PRA COVerage 0.07282666666666666 13 | Outcome ALL 5 14 | Average SAE 580.970835247516 15 | Median SAE 46.88263239257573 16 | MSE 312.16432163494824 17 | MAE 3.9602365100725545 18 | Acc@100 69.53033333333335 19 | Acc@161 76.40966666666667 20 | Average CAE 635.19984691036 21 | Median CAE 106.95600581436469 22 | Average 95% PRA 15.877051629880834 23 | Median 95% PRA 11.788060470218173 24 | PRA COVerage 0.12605666666666668 25 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO-O5-d-total_mean-pos_spher-weighted-N30e5-B12-E2-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2022-11-28.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 1875.2932581156072 3 | Median SAE 172.9306858701335 4 | MSE 1099.589240160285 5 | MAE 25.47369988597128 6 | Acc@100 44.204 7 | Acc@161 49.30766666666667 8 | Average CAE 1911.9219339073627 9 | Median CAE 208.6942984449754 10 | Average 95% PRA 95.75923460662027 11 | Median 95% PRA 4.8870412499624925 12 | PRA COVerage 0.04433 13 | Outcome ALL 5 14 | Average SAE 1875.3638156446036 15 | Median SAE 172.93952006940208 16 | MSE 1099.3982200402288 17 | MAE 12.738032951488462 18 | Acc@100 44.167 19 | Acc@161 49.288 20 | Average CAE 1927.2933304174228 21 | Median CAE 235.14731426613002 22 | Average 95% PRA 24.574656301155283 23 | Median 95% PRA 13.810513023648852 24 | PRA COVerage 0.075412 25 | -------------------------------------------------------------------------------- /results/metric/U-TEXT-ONLY-O5-d-total_mean-pos_spher-weighted-N30e5-B12-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2022-11-30.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 781.7706650795018 3 | Median SAE 86.86877967943047 4 | MSE 407.88967359115975 5 | MAE 10.617183446501544 6 | Acc@100 53.806 7 | Acc@161 64.97433333333333 8 | Average CAE 902.3277154947419 9 | Median CAE 234.14109002107455 10 | Average 95% PRA 148.92441323319156 11 | Median 95% PRA 24.70561775871207 12 | PRA COVerage 0.13995 13 | Outcome ALL 5 14 | Average SAE 782.0235068256861 15 | Median SAE 87.21493077857644 16 | MSE 407.93450996375645 17 | MAE 5.310163434238611 18 | Acc@100 53.727000000000004 19 | Acc@161 64.93766666666667 20 | Average CAE 913.1045627056881 21 | Median CAE 267.13501018077693 22 | Average 95% PRA 36.20489287325245 23 | 
Median 95% PRA 26.438525828931986 24 | PRA COVerage 0.118212 25 | -------------------------------------------------------------------------------- /results/metric/U-TEXT-ONLY-O5-d-total_mean-pos_spher-weighted-N30e5-B12-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2022-11-30.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 1547.0540351194697 3 | Median SAE 176.21210408917602 4 | MSE 839.0551868683772 5 | MAE 20.879623384049147 6 | Acc@100 41.24033333333333 7 | Acc@161 48.747 8 | Average CAE 1710.817416332019 9 | Median CAE 423.7717242604772 10 | Average 95% PRA 546.0467202967111 11 | Median 95% PRA 59.65755199413765 12 | PRA COVerage 0.11468333333333333 13 | Outcome ALL 5 14 | Average SAE 1547.3396256779183 15 | Median SAE 176.48929304210884 16 | MSE 839.1979633426333 17 | MAE 10.440824292898615 18 | Acc@100 41.199000000000005 19 | Acc@161 48.708666666666666 20 | Average CAE 1708.7545371274753 21 | Median CAE 442.1363776181922 22 | Average 95% PRA 58.81665073457992 23 | Median 95% PRA 38.20872910249881 24 | PRA COVerage 0.08802666666666667 25 | -------------------------------------------------------------------------------- /results/metric/US-U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N0e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N50_VF-NON-GEO_2022-11-22.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 9914.668096263977 3 | Median SAE 9995.085508992233 4 | MSE 5357.341812998586 5 | MAE 119.35189060703888 6 | Acc@100 0.0 7 | Acc@161 0.0 8 | Average CAE 9914.562013061304 9 | Median CAE 10002.199218399091 10 | Average 95% PRA 12.175786353180174 11 | Median 95% PRA 11.89449912855745 12 | PRA COVerage 0.0 13 | Outcome ALL 5 14 | Average SAE 9926.753104973639 15 | Median SAE 10001.339856228697 16 | MSE 5363.141463231329 17 | MAE 59.75125683217192 18 | Acc@100 0.0 19 | Acc@161 0.0 20 | Average CAE 9927.272739097398 21 | Median CAE 10007.489719519272 22 | Average 95% PRA 16.772039178406793 23 | Median 95% PRA 16.734159757402896 24 | PRA COVerage 0.0 25 | -------------------------------------------------------------------------------- /results/metric/US-U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2023-02-23.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 802.0893183793827 3 | Median SAE 24.974492747904417 4 | MSE 151.58256803467805 5 | MAE 9.968539887598322 6 | Acc@100 61.97266666666666 7 | Acc@161 63.995999999999995 8 | Average CAE 836.093719699225 9 | Median CAE 57.03076756833259 10 | Average 95% PRA 49.5394874676072 11 | Median 95% PRA 2.9997513563907243 12 | PRA COVerage 0.30149333333333334 13 | Outcome ALL 5 14 | Average SAE 802.0893183927425 15 | Median SAE 24.974492747940083 16 | MSE 151.58256804297062 17 | MAE 4.984269943866374 18 | Acc@100 61.97266666666666 19 | Acc@161 63.995999999999995 20 | Average CAE 836.0937366044021 21 | Median CAE 57.07486283490132 22 | Average 95% PRA 20.88662783493309 23 | Median 95% PRA 7.513931392525943 24 | PRA COVerage 0.16537466666666667 25 | -------------------------------------------------------------------------------- /results/metric/US-U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2023-02-23.txt: 
-------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 1162.604288374721 3 | Median SAE 598.8986795557007 4 | MSE 198.96711438281176 5 | MAE 14.63374204393779 6 | Acc@100 39.416333333333334 7 | Acc@161 41.291666666666664 8 | Average CAE 1204.8100220315987 9 | Median CAE 679.5672225740713 10 | Average 95% PRA 104.09604027290496 11 | Median 95% PRA 63.443590773093476 12 | PRA COVerage 0.20825333333333335 13 | Outcome ALL 5 14 | Average SAE 1162.604288516567 15 | Median SAE 598.8986795584461 16 | MSE 198.96711445363616 17 | MAE 7.316871022877962 18 | Acc@100 39.416333333333334 19 | Acc@161 41.291666666666664 20 | Average CAE 1204.9063324921221 21 | Median CAE 678.7367829126739 22 | Average 95% PRA 35.27105756637679 23 | Median 95% PRA 34.555602828281465 24 | PRA COVerage 0.11697533333333333 25 | -------------------------------------------------------------------------------- /results/metric/US-U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N4999_UVF-NON-GEO_2023-02-24.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 375.03136106819596 3 | Median SAE 12.91331621363022 4 | MSE 92.0769672561113 5 | MAE 4.7583405970026575 6 | Acc@100 80.63612722544508 7 | Acc@161 83.19663932786557 8 | -------------------------------------------------------------------------------- /results/metric/US-U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N4999_UVF-TEXT-ONLY_2023-02-24.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 431.1931917816224 3 | Median SAE 14.518444548017039 4 | MSE 98.51503430916212 5 | MAE 5.539108098993033 6 | Acc@100 75.77515503100621 7 | Acc@161 78.11562312462492 8 | -------------------------------------------------------------------------------- /runs/prob/U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]-2022-10-06_logs/events.out.tfevents.1665082080.gpu127.3070880.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/runs/prob/U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]-2022-10-06_logs/events.out.tfevents.1665082080.gpu127.3070880.0 -------------------------------------------------------------------------------- /runs/prob/U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]-2022-10-06_logs/events.out.tfevents.1665062385.gpu118.2632089.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/runs/prob/U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]-2022-10-06_logs/events.out.tfevents.1665062385.gpu118.2632089.0 -------------------------------------------------------------------------------- /runs/spat/U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]-2022-10-09_logs/events.out.tfevents.1665313776.gpu113.3164054.0: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/runs/spat/U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]-2022-10-09_logs/events.out.tfevents.1665313776.gpu113.3164054.0 -------------------------------------------------------------------------------- /runs/spat/U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]-2022-10-23_logs/events.out.tfevents.1666523343.gpu148.1960620.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/runs/spat/U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]-2022-10-23_logs/events.out.tfevents.1666523343.gpu148.1960620.0 -------------------------------------------------------------------------------- /supplementary_resources/article_draft.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/supplementary_resources/article_draft.pdf -------------------------------------------------------------------------------- /supplementary_resources/img/loss-graph-prob.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/supplementary_resources/img/loss-graph-prob.png -------------------------------------------------------------------------------- /supplementary_resources/img/loss-graph-spat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/supplementary_resources/img/loss-graph-spat.png -------------------------------------------------------------------------------- /supplementary_resources/img/map-density.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/supplementary_resources/img/map-density.png -------------------------------------------------------------------------------- /supplementary_resources/img/model-train.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/supplementary_resources/img/model-train.png -------------------------------------------------------------------------------- /supplementary_resources/img/mop-loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/supplementary_resources/img/mop-loss.png -------------------------------------------------------------------------------- /supplementary_resources/img/prediction-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/supplementary_resources/img/prediction-example.png -------------------------------------------------------------------------------- /supplementary_resources/img/sop-loss.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/supplementary_resources/img/sop-loss.png -------------------------------------------------------------------------------- /supplementary_resources/img/total-loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/supplementary_resources/img/total-loss.png -------------------------------------------------------------------------------- /supplementary_resources/scripts/bash/collector.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -p defaultp # partition (queue) 3 | #SBATCH -N 1 # number of nodes 4 | #SBATCH -c 4 # number of cpus (cores) 5 | #SBATCH --mem 150G # memory pool for all cores # 32GB for folktables ablation 6 | #SBATCH --time 0-1:59:00 # time (D-HH:MM:SS) 7 | #SBATCH --job-name=clt-data 8 | #SBATCH -o world.%A_%a.%N.out # STDOUT 9 | #SBATCH -e world.%A_%a.%N.err # STDERR 10 | #SBATCH --requeue 11 | 12 | SEED=${SLURM_ARRAY_TASK_ID:-0} 13 | SEEDSTR=$( printf "%01d" $SEED ) 14 | 15 | hostname 16 | date 17 | 18 | module load python/3.8 19 | source ~/twitter-env/bin/activate 20 | 21 | export OMP_NUM_THREADS=4 22 | 23 | ARG=( "$@" ) 24 | FILE="${ARG[0]}" 25 | 26 | if [ -e ${FILE} ] 27 | then 28 | echo ${FILE} exists 29 | stat -L -c "%a %G %U" ${FILE} 30 | cd ${HOME}/geo-twitter/ 31 | srun python -u data-collector.py ${ARG[*]} 32 | else 33 | echo ${FILE} does not exist 34 | fi 35 | 36 | date 37 | -------------------------------------------------------------------------------- /supplementary_resources/scripts/bash/data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | FILES20="/archive3/group/chlgrp/twitter-collection-2020/twitter-2020-*.txt" 4 | FILES21="/archive3/group/chlgrp/twitter-collection-2021/twitter-2021-*.txt" 5 | FILES22="/archive3/group/chlgrp/twitter-collection-2022/twitter-2022-*.txt" 6 | 7 | FILES=( $FILES20 $FILES21 $FILES22 ) 8 | total=${#FILES[@]} 9 | echo Total number of files is ${total} 10 | c=0 11 | 12 | 13 | for f in "${FILES[@]}" 14 | do 15 | if [ -e ${f} ] 16 | then 17 | echo ${f} exists 18 | stat -L -c "%a %G %U" ${f} 19 | if [ ! 
-s ${HOME}/geo-twitter/datasets/world/${f:47:18}.txt ] 20 | then 21 | echo filtered file is empty or does not exist 22 | sbatch collector.sh ${f} 23 | c=$((c+1)) 24 | else 25 | echo filtered dataset is already collected 26 | fi 27 | else 28 | echo ${f} does not exist 29 | fi 30 | done 31 | 32 | 33 | echo Total number of files is ${total} 34 | echo Total number of scripts launched is ${c} 35 | -------------------------------------------------------------------------------- /supplementary_resources/scripts/bash/train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --ntasks=1 3 | #SBATCH --partition=gpu 4 | #SBATCH --gres=gpu:1 5 | #SBATCH --constraint=GTX1080Ti 6 | #SBATCH --cpus-per-task=12 7 | #SBATCH --export=NONE 8 | #SBATCH --mem 100G # memory pool for all cores # 32GB for folktables ablation 9 | #SBATCH --time 10-00:00:00 # time (D-HH:MM:SS) 10 | #SBATCH --job-name=world-model 11 | #SBATCH -o world-model.%A_%a.%N.out # STDOUT 12 | #SBATCH -e world-model.%A_%a.%N.err # STDERR 13 | #SBATCH --requeue 14 | 15 | unset SLURM_EXPORT_ENV 16 | 17 | SEED=${SLURM_ARRAY_TASK_ID:-0} 18 | SEEDSTR=$( printf "%01d" $SEED ) 19 | 20 | hostname 21 | date 22 | 23 | module load python/3.8 24 | module load cuda 25 | source ${HOME}/twitter-env/bin/activate 26 | 27 | export OMP_NUM_THREADS=12 28 | export CUDA_VISIBLE_DEVICES=0 29 | 30 | cd ${HOME}/geo-twitter/ 31 | 32 | ARG=( "$@" ) 33 | srun --cpu_bind=verbose python -u train_bert.py ${ARG[*]} 34 | 35 | date 36 | -------------------------------------------------------------------------------- /supplementary_resources/scripts/python/bert_train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from transformers import BertTokenizer 3 | from utils.model_trainer import * 4 | 5 | # Entry point for training and evaluation of the models 6 | 7 | f = ["GEO", "NON-GEO", "META-DATA", "TEXT-ONLY", "GEO-ONLY", "USER-ONLY"] 8 | 9 | dataset_file = 'worldwide-twitter-day-small.jsonl' # .jsonl 10 | features = [f[1], f[4]] 11 | val_f = None # None -> features[0] 12 | target_columns = ["lon", "lat"] 13 | 14 | original_worldwide_model = "bert-base-multilingual-cased" 15 | original_usa_model = "bert-base-cased" 16 | 17 | # parameters = dict( 18 | # max_lr = [5e-5, 1e-5], 19 | # min_lr = [5e-6, 1e-6, 1e-8, 1e-16], 20 | # scheduler = ["cosine", "plateau"] 21 | # ) 22 | # param_values = [v for v in parameters.values()] 23 | 24 | covariance_types = [None, "spher"] # [None, "full", "spher", "diag", "tied"] 25 | scheduler_types = ["cosine", "linear", "plateau"] # ["cosine", "linear", "cosine-long", "plateau", "step", "multi step", "one cycle", "cyclic"] 26 | 27 | loss_distance = True 28 | loss_mf = "mean" # mean/sum - mean if features > 1 29 | loss_prob = "pos" # all/pos - pos if prob 30 | loss_total = "mean" # sum/mean/type - mean if prob else type (spat) 31 | 32 | outcomes = 5 33 | covariance = covariance_types[1] # None/spher 34 | 35 | epochs = 3 36 | log_step = 1000 37 | 38 | batch_size = 4 39 | 40 | lr_max = 1e-5 41 | lr_min = 1e-6 42 | scheduler = scheduler_types[0] 43 | 44 | val_size = 1000 # samples/users if -vu 45 | threshold = 100 46 | 47 | train_size = 0 48 | test_ratio = 0.1 49 | seed = 42 50 | 51 | 52 | def main(): 53 | parser = argparse.ArgumentParser(description='Finetune multilingual transformer model') 54 | 55 | # common 56 | parser.add_argument('-n', '--n-epochs', type=int, default=epochs, help='Number of the training epochs') 57 | 
parser.add_argument('-b', '--batch_size', type=int, default=batch_size, help='Batch size (default: 4)') 58 | 59 | parser.add_argument('-m', '--local_model', type=str, default=None, 60 | help='Filename prefix of local model (!NOTE must fit related args)') 61 | 62 | parser.add_argument('-bb', '--bert_base', type=str, default=None, 63 | help="BERT base model (default: worldwide)") 64 | 65 | 66 | parser.add_argument('--train', action="store_true", help="Start finetuning") 67 | parser.add_argument('--eval', action="store_true", help="Start evaluation") 68 | parser.add_argument('--hptune', action="store_true", help="Start training with hyperparameter tuning") 69 | 70 | # dataset 71 | parser.add_argument('-d', '--dataset', type=str, default=dataset_file, help="Input dataset (in jsonl format)") 72 | parser.add_argument('-ss', '--skip', type=int, default=0, help='Number of dataset samples to skip') 73 | 74 | parser.add_argument('-f', '--features', default=features, nargs='+', help="Feature column names") 75 | parser.add_argument('-t', '--target-cols', default=target_columns, nargs='+', help="Target column names") 76 | 77 | parser.add_argument('-s', '--seed', type=int, default=seed, help='Random seed (default: 42)') 78 | 79 | 80 | parser.add_argument('-nnv', '--norm_numb', action="store_true", 81 | help="Normalize labels' number values (default: False)") 82 | 83 | parser.add_argument('-cv', '--class_val', action="store_true", 84 | help="Classification labels as values (default: False)") 85 | 86 | 87 | # --train 88 | parser.add_argument('--no-ckp', action="store_false", help='Saving model checkpoints during training (Default: True)') 89 | 90 | parser.add_argument('-ls', '--log_step', type=int, default=log_step, help='Log step (default: 1000)') 91 | 92 | parser.add_argument('-lr', '--learn_rate', type=float, default=lr_max, 93 | help='Learning rate maximum to start from (default: 1e-5)') 94 | parser.add_argument('-lrm', '--learn_rate_min', type=float, default=lr_min, 95 | help='Learning rate minimum to end on (default: 1e-6)') 96 | parser.add_argument('-sdl', '--scheduler', type=str, default=scheduler, help="Scheduler type (Default: 'cosine')") 97 | 98 | parser.add_argument('-ts', '--train_size', type=int, default=train_size, help='Training dataloader size') 99 | parser.add_argument('-tr', '--test_ratio', type=float, default=test_ratio, help='Training dataloader test data ratio (default: 0.1)') 100 | 101 | parser.add_argument('-lmf', '--loss_mf', type=str, default=loss_mf, 102 | help="Multi feature loss handle mean or sum (default: mean)") 103 | 104 | 105 | # --eval 106 | parser.add_argument('-v', '--val_size', type=int, default=val_size, help='Validation dataloader size') 107 | 108 | # geo specific 109 | parser.add_argument('-o', '--outcomes', type=int, default=outcomes, help="Number of outcomes (lon, lat) per tweet") 110 | parser.add_argument('-ld', '--loss_dist', action="store_false", help="Distance loss criterion (default: True)") 111 | 112 | parser.add_argument('-lp', '--loss_prob', type=str, default=loss_prob, 113 | help="GMM neg LLH loss stays in domain 'all' or 'pos' (default: 'pos')") 114 | parser.add_argument('-c', '--covariance', type=str, default=covariance, 115 | help="GMM covariance matrix type (Default: 'spher')") 116 | parser.add_argument('-nw', '--not-weighted', action="store_false", 117 | help="GMM weights are not equal (default: True)") 118 | 119 | parser.add_argument('-lt', '--loss_total', type=str, default=loss_total, 120 | help="Total loss handle by model's 'type' or 'sum' of 
all loss values (default: 'type')") 121 | 122 | args = parser.parse_args() 123 | 124 | if args.local_model is None: 125 | prefix = f"{'US-' if args.usa_model else ''}{'U-' if not args.scale_coord else ''}{'+'.join(args.features)}-O{args.outcomes}-{'d' if args.loss_dist else 'c'}-" \ 126 | f"total_{args.loss_total if args.covariance is not None else 'type'}-{'mf_' + args.loss_mf + '-' if len(args.features) > 1 else ''}" \ 127 | f"{args.loss_prob + '_' if args.covariance is not None else ''}{args.covariance if args.covariance is not None else 'NP'}-" \ 128 | f"{'weighted-' if args.weighted and args.outcomes > 1 else ''}N{args.train_size//100000}e5-" \ 129 | f"B{args.batch_size}-E{args.nepochs}-{args.scheduler}-LR[{args.learn_rate};{args.learn_rate_min}]" 130 | else: 131 | prefix = args.local_model 132 | 133 | print(f"Model prefix:\t{prefix}") 134 | if torch.cuda.is_available(): 135 | print(f"DEVICE\tAvailable GPU has {torch.cuda.device_count()} devices, using {torch.cuda.get_device_name(0)}") 136 | print(f"DEVICE\tCPU has {torch.get_num_threads()} threads") 137 | else: 138 | print(f"DEVICE\tNo GPU available, using the CPU with {torch.get_num_threads()} threads instead.") 139 | 140 | original_model = original_usa_model if args.usa_model else original_worldwide_model 141 | 142 | dataloader = TwitterDataloader(args.dataset, 143 | args.features, 144 | target_columns, 145 | BertTokenizer.from_pretrained(original_model), 146 | args.seed, 147 | args.scale_coord, 148 | val_f) 149 | 150 | # no settings run to save filtered by condition dataset copy 151 | # dataloader.filter_dataset("code", None, ["CA", 'FR', 'GB']) 152 | 153 | trainer = ModelTrainer(prefix, 154 | dataloader, 155 | args.nepochs, 156 | args.batch_size, 157 | args.outcomes, 158 | args.covariance, 159 | args.weighted, 160 | args.loss_dist, 161 | args.loss_mf, 162 | args.loss_prob, 163 | args.loss_total, 164 | args.learn_rate, 165 | args.learn_rate_min, 166 | original_model) 167 | 168 | # if args.hptune: 169 | # trainer.hp_tuning(args.train_size, 170 | # args.test_ratio, 171 | # param_values, 172 | # args.log_step) 173 | 174 | if args.train: 175 | trainer.finetune(args.train_size, 176 | args.test_ratio, 177 | f"{prefix}.pth", 178 | args.nockp, 179 | args.log_step, 180 | args.scheduler, 181 | args.skip) 182 | 183 | if args.eval: 184 | trainer.eval(args.val_size, 185 | args.threshold, 186 | args.val_size, 187 | args.val_user, 188 | args.train_size) 189 | 190 | 191 | if __name__ == "__main__": 192 | main() 193 | -------------------------------------------------------------------------------- /supplementary_resources/scripts/python/camambert-test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import re 4 | 5 | from transformers import CamembertModel, CamembertTokenizer, AdamW, get_linear_schedule_with_warmup 6 | 7 | from sklearn.model_selection import train_test_split 8 | from sklearn.preprocessing import StandardScaler 9 | 10 | import torch 11 | import torch.nn as nn 12 | from torch.utils.data import TensorDataset, DataLoader 13 | from torch.nn.utils.clip_grad import clip_grad_norm 14 | 15 | file = 'ua_dataset.jsonl' 16 | model_name = "camembert-base" 17 | 18 | output_model = 'reg_saved.pth' 19 | 20 | epochs = 3 21 | batch_size = 16 22 | 23 | tokenizer = CamembertTokenizer.from_pretrained(model_name) 24 | coord_scaler = StandardScaler() 25 | 26 | # minimal settings test run 27 | # original to this adaptation code: 
https://medium.com/@anthony.galtier/fine-tuning-bert-for-a-regression-task-is-a-description-enough-to-predict-a-propertys-list-price-cf97cd7cb98a 28 | # author: Anthony Galtier 29 | 30 | def filter_websites(text): 31 | pattern = r'(http\:\/\/|https\:\/\/)?([a-z0-9][a-z0-9\-]*\.)+[a-z][a-z\-]*' 32 | text = re.sub(pattern, '', text) 33 | return text 34 | 35 | 36 | def filter_long_descriptions(tokenizer, descriptions, max_len): 37 | indices = [] 38 | lengths = tokenizer(descriptions, padding=False, 39 | truncation=False, return_length=True)['length'] 40 | for i in range(len(descriptions)): 41 | if lengths[i] <= max_len-2: 42 | indices.append(i) 43 | return indices 44 | 45 | 46 | def create_dataloaders(inputs, masks, labels, batch_size): 47 | input_tensor = torch.tensor(inputs) 48 | mask_tensor = torch.tensor(masks) 49 | labels_tensor = torch.tensor(labels) 50 | dataset = TensorDataset(input_tensor, mask_tensor, 51 | labels_tensor) 52 | dataloader = DataLoader(dataset, batch_size=batch_size, 53 | shuffle=True) 54 | return dataloader 55 | 56 | 57 | class BertRegressor(nn.Module): 58 | def __init__(self, drop_rate=0.2, freeze_camembert=False): 59 | super(BertRegressor, self).__init__() 60 | D_in, D_out = 768, 1 61 | 62 | self.bert = CamembertModel.from_pretrained(model_name, return_dict=True) 63 | self.regressor = nn.Sequential( 64 | nn.Dropout(drop_rate), 65 | nn.Linear(D_in, D_out)) 66 | 67 | def forward(self, input_ids, attention_masks): 68 | 69 | outputs = self.bert(input_ids, attention_masks) 70 | class_label_output = outputs[1] 71 | outputs = self.regressor(class_label_output) 72 | return outputs 73 | 74 | 75 | def train(model, optimizer, scheduler, loss_function, epochs, 76 | train_dataloader, device, clip_value=2): 77 | for epoch in range(epochs): 78 | print("Epoch:", epoch) 79 | print("-----") 80 | best_loss = 1e10 81 | model.train() 82 | for step, batch in enumerate(train_dataloader): 83 | print("Step:", step) 84 | batch_inputs, batch_masks, batch_labels = tuple(b.to(device) for b in batch) 85 | model.zero_grad() 86 | outputs = model(batch_inputs, batch_masks) 87 | loss = loss_function(outputs.squeeze(), 88 | batch_labels.squeeze()) 89 | loss.backward() 90 | clip_grad_norm(model.parameters(), clip_value) 91 | optimizer.step() 92 | scheduler.step() 93 | 94 | return model 95 | 96 | 97 | def evaluate(model, loss_function, test_dataloader, device): 98 | model.eval() 99 | test_loss, test_r2 = [], [] 100 | for batch in test_dataloader: 101 | batch_inputs, batch_masks, batch_labels = tuple(b.to(device) for b in batch) 102 | with torch.no_grad(): 103 | outputs = model(batch_inputs, batch_masks) 104 | loss = loss_function(outputs, batch_labels) 105 | test_loss.append(loss.item()) 106 | r2 = r2_score(outputs, batch_labels) 107 | test_r2.append(r2.item()) 108 | return test_loss, test_r2 109 | 110 | 111 | def r2_score(outputs, labels): 112 | labels_mean = torch.mean(labels) 113 | ss_tot = torch.sum((labels - labels_mean) ** 2) 114 | ss_res = torch.sum((labels - outputs) ** 2) 115 | r2 = 1 - ss_res / ss_tot 116 | return r2 117 | 118 | 119 | def predict(model, dataloader, device): 120 | model.eval() 121 | output = [] 122 | for batch in dataloader: 123 | batch_inputs, batch_masks, _ = tuple(b.to(device) for b in batch) 124 | with torch.no_grad(): 125 | res = model(batch_inputs, 126 | batch_masks) 127 | #print(res) 128 | #print(res.view(1,-1).tolist()[0]) 129 | output += res.view(1,-1).tolist()[0] 130 | #print(output) 131 | return output 132 | 133 | 134 | def pretraining(model): 135 | model = 
train(model, optimizer, scheduler, loss_function, epochs, 136 | train_dataloader, device, clip_value=2) 137 | 138 | def save(model, optimizer): 139 | torch.save({ 140 | 'model_state_dict': model.state_dict(), 141 | 'optimizer_state_dict': optimizer.state_dict() 142 | }, output_model) 143 | 144 | print(evaluate(model, loss_function, test_dataloader, device)) 145 | 146 | save(model, optimizer) 147 | 148 | 149 | def evalueting(model): 150 | checkpoint = torch.load(output_model, map_location='cpu') 151 | model.load_state_dict(checkpoint['model_state_dict']) 152 | optimizer.load_state_dict(checkpoint['optimizer_state_dict']) 153 | 154 | val_set = val_data 155 | 156 | encoded_val_corpus = tokenizer(text=val_set.clear_text.tolist(), 157 | add_special_tokens=True, 158 | padding='max_length', 159 | truncation='longest_first', 160 | max_length=300, 161 | return_attention_mask=True) 162 | 163 | val_input_ids = np.array(encoded_val_corpus['input_ids']) 164 | val_attention_mask = np.array(encoded_val_corpus['attention_mask']) 165 | val_labels = val_set.longitude.to_numpy().astype(np.float32) 166 | val_labels = coord_scaler.transform(val_labels.reshape(-1, 1)) 167 | val_dataloader = create_dataloaders(val_input_ids, 168 | val_attention_mask, val_labels, batch_size) 169 | 170 | y_pred_scaled = predict(model, val_dataloader, device) 171 | print(y_pred_scaled) 172 | 173 | y_test = val_set.longitude.to_numpy() 174 | y_pred = coord_scaler.inverse_transform(np.asarray(y_pred_scaled, dtype=np.float32).reshape(-1, 1)) 175 | 176 | print(y_pred) 177 | 178 | for i in range(len(y_test)): 179 | print(y_test[i], y_pred[i][0]) 180 | 181 | from sklearn.metrics import mean_absolute_error 182 | from sklearn.metrics import median_absolute_error 183 | from sklearn.metrics import mean_squared_error 184 | from sklearn.metrics import mean_absolute_percentage_error 185 | from sklearn.metrics import r2_score 186 | 187 | mae = mean_absolute_error(y_test, y_pred) 188 | mdae = median_absolute_error(y_test, y_pred) 189 | mse = mean_squared_error(y_test, y_pred) 190 | mape = mean_absolute_percentage_error(y_test, y_pred) 191 | #mdape = ((pd.Series(y_test) - pd.Series(y_pred)) / pd.Series(y_test)).abs().median() 192 | r_squared = r2_score(y_test, y_pred) 193 | 194 | print(mae, mdae, mse, mape, r_squared) 195 | 196 | 197 | data = pd.read_json(path_or_buf=file, lines=True) 198 | print(data.head()) 199 | print(data.info()) 200 | 201 | data['clear_text'] = data.texts.apply(filter_websites) 202 | 203 | train_data = data.iloc[100:, :] 204 | val_data = data.iloc[:100, :] 205 | 206 | df = train_data 207 | print(df.info()) 208 | 209 | encoded_corpus = tokenizer(text=df.clear_text.tolist(), 210 | add_special_tokens=True, 211 | padding='max_length', 212 | truncation='longest_first', 213 | max_length=300, 214 | return_attention_mask=True) 215 | 216 | input_ids = encoded_corpus['input_ids'] 217 | attention_mask = encoded_corpus['attention_mask'] 218 | 219 | short_descriptions = filter_long_descriptions(tokenizer, df.clear_text.tolist(), 300) 220 | input_ids = np.array(input_ids)[short_descriptions] 221 | attention_mask = np.array(attention_mask)[short_descriptions] 222 | labels = df.longitude.to_numpy()[short_descriptions].astype(np.float32) 223 | 224 | test_size = 0.1 225 | seed = 42 226 | 227 | train_inputs, test_inputs, train_labels, test_labels = train_test_split(input_ids, 228 | labels, 229 | test_size=test_size, 230 | random_state=seed) 231 | 232 | train_masks, test_masks, _, _ = train_test_split(attention_mask, 233 | labels, 234 | 
test_size=test_size, 235 | random_state=seed) 236 | 237 | 238 | coord_scaler.fit(train_labels.reshape(-1, 1)) 239 | 240 | train_labels = coord_scaler.transform(train_labels.reshape(-1, 1)) 241 | test_labels = coord_scaler.transform(test_labels.reshape(-1, 1)) 242 | 243 | train_dataloader = create_dataloaders(train_inputs, train_masks, 244 | train_labels, batch_size) 245 | test_dataloader = create_dataloaders(test_inputs, test_masks, 246 | test_labels, batch_size) 247 | 248 | model = BertRegressor(drop_rate=0.2) 249 | 250 | if torch.cuda.is_available(): 251 | device = torch.device("cuda") 252 | print("Using GPU.") 253 | else: 254 | print("No GPU available, using the CPU instead.") 255 | device = torch.device("cpu") 256 | 257 | model.to(device) 258 | 259 | 260 | optimizer = AdamW(model.parameters(), 261 | lr=5e-5, 262 | eps=1e-8) 263 | 264 | 265 | total_steps = len(train_dataloader) * epochs 266 | scheduler = get_linear_schedule_with_warmup(optimizer, 267 | num_warmup_steps=0, num_training_steps=total_steps) 268 | 269 | loss_function = nn.MSELoss() 270 | 271 | #pretraining(model) 272 | evalueting(model) 273 | -------------------------------------------------------------------------------- /supplementary_resources/scripts/python/coords_plots.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import pandas as pd 4 | import matplotlib.cm as cm 5 | import scipy.stats as st 6 | from mpl_toolkits.basemap import Basemap 7 | from matplotlib.colors import LinearSegmentedColormap 8 | from sklearn.mixture import GaussianMixture, BayesianGaussianMixture 9 | from matplotlib.patches import Ellipse 10 | from matplotlib.colors import LogNorm 11 | from sklearn.preprocessing import MinMaxScaler 12 | 13 | 14 | def load_jsonl(filename): 15 | filename = f"datasets/{filename}" 16 | print(f"DATASET\tLOAD\tLoading dataset from {filename}") 17 | data = pd.read_json(path_or_buf=filename, lines=True) 18 | print(f"DATASET\tLOAD\tDataset of {len(data.index)} coords is loaded") 19 | return data 20 | 21 | 22 | def save_df(data, filename): 23 | with open(filename, "w") as f: 24 | data.to_json(f, orient='records', lines=True) 25 | print(f"VAL\tSAVE\tEstimated data of {len(data.index)} coords is written to file: {filename}") 26 | 27 | # coords = load_jsonl(filename) 28 | # 29 | # #print(coords) 30 | # test = coords[::100] 31 | # print(test) 32 | # 33 | # dens_u = sm.nonparametric.KDEMultivariate(data=[coords["longitude"], coords["latitude"]], var_type='cc') 34 | # print(dens_u) 35 | # print(dens_u.bw) 36 | # 37 | # test["density"] = dens_u.pdf(test) 38 | # save_df(test, "datasets/test.jsonl") 39 | # #print(dens_test) 40 | # 41 | # x = test["longitude"] 42 | # y = test["latitude"] 43 | # #plt.pcolormesh([x, y], dens_test, shading='auto') 44 | # #plt.show() 45 | 46 | 47 | # Kernel Density Estimation surface on map 48 | def kde(coords): 49 | x, y = coords["longitude"], coords["latitude"] 50 | 51 | xmin, xmax = -180, 180 52 | ymin, ymax = -90, 90 53 | 54 | xx, yy = np.mgrid[xmin:xmax:200j, ymin:ymax:200j] 55 | print(f"KDE for {len(xx)**2} points is calculating") 56 | 57 | positions = np.vstack([xx.ravel(), yy.ravel()]) 58 | values = np.vstack([x, y]) 59 | kernel = st.gaussian_kde(values) 60 | # f = np.reshape(kernel(positions).T, xx.shape) 61 | # kde = pd.DataFrame(f) 62 | # save_df(kde, f"datasets/{kde_results}") 63 | f = load_jsonl(kde_results) 64 | print(f) 65 | 66 | fig = plt.figure(figsize=(20, 15)) 67 | ax = 
plt.axes(projection='3d') 68 | 69 | ncolors = 256 70 | color_array = plt.get_cmap('rainbow')(range(ncolors)) 71 | color_array[:,-1] = np.linspace(0.2,1.0,ncolors) 72 | map_object = LinearSegmentedColormap.from_list(name='rainbow_alpha',colors=color_array) 73 | plt.register_cmap(cmap=map_object) 74 | 75 | surf = ax.plot_surface(xx, yy, f, rstride=1, cstride=1, cmap='rainbow_alpha', edgecolor='none') 76 | ax.contour(xx, yy, f, zdir='z', offset=0, cmap=cm.coolwarm) 77 | 78 | map = Basemap(fix_aspect=False) 79 | ax.add_collection3d(map.drawcoastlines(linewidth=0.25)) 80 | ax.add_collection3d(map.drawcountries(linewidth=0.35)) 81 | 82 | ax.set_yticks(range(-90, 90, 30)) 83 | ax.set_xticks(range(-180, 180, 30)) 84 | 85 | ax.set_box_aspect((4, 3, 1)) 86 | ax.set_xlabel('longitude') 87 | ax.set_ylabel('latitude') 88 | ax.set_zlabel('PDF') 89 | ax.set_title('Surface plot of Gaussian 2D KDE for 200x200 points estimated from worldwide tweets 2022') 90 | fig.colorbar(surf, shrink=0.5, aspect=10, location='left') # add color bar indicating the PDF 91 | ax.view_init(20, -60) 92 | 93 | pic = f"results/img/kde_test_world.png" 94 | 95 | #fig.tight_layout() 96 | plt.savefig(pic, dpi=600) 97 | print(f"VAL\tSAVE\tPlot of {len(f.index)} samples is drawn to file: {pic}") 98 | plt.show() 99 | 100 | 101 | # GMM clustering of point on the map 102 | def gmm(coords, peaks, seed): 103 | X = coords.to_numpy(dtype=float) 104 | xmin, xmax = -180., 180. 105 | ymin, ymax = -90., 90. 106 | print(X) 107 | 108 | fig, ax = plt.subplots(figsize=(20, 10)) 109 | ax.set_xlim(xmin, xmax) 110 | ax.set_ylim(ymin, ymax) 111 | map = Basemap() 112 | map.drawcoastlines(linewidth=0.5, color="black") 113 | map.drawcountries(linewidth=0.7, color="black") 114 | map.drawparallels(np.arange(ymin, ymax, 30.)) 115 | map.drawmeridians(np.arange(xmin, xmax, 30.)) 116 | map.drawmapboundary(fill_color='azure') 117 | map.fillcontinents(color='white', lake_color='azure') 118 | 119 | print(f"Calculating clusters of {X.shape[0]} points from GMM with {peaks} means and random seed {seed}") 120 | gmm = GaussianMixture(n_components=peaks, covariance_type='full', random_state=seed).fit(X) 121 | labels = gmm.predict(X) 122 | probs = gmm.predict_proba(X) 123 | size = probs.max(1)**2 124 | 125 | map.scatter(X[:, 0], X[:, 1], c=labels, s=size, cmap="turbo", zorder=5) 126 | #plt.scatter(X[:, 0], X[:, 1], s=4, c="black") 127 | 128 | w_factor = 0.2 / gmm.weights_.max() 129 | for pos, covar, w in zip(gmm.means_, gmm.covariances_, gmm.weights_): 130 | U, s, Vt = np.linalg.svd(covar) 131 | angle = np.degrees(np.arctan2(U[1, 0], U[0, 0])) 132 | width, height = 2 * np.sqrt(s) 133 | for nsig in range(1, 4): 134 | ax.add_patch(Ellipse(pos, nsig * width, nsig * height, angle, 135 | alpha=w * w_factor, color="black")) 136 | 137 | plt.title(f'Scatter plot of coordinated') 138 | plt.xlabel('Longitude') 139 | plt.ylabel('Latitude') 140 | plt.show() 141 | 142 | 143 | # BIC and AIC criterion for GMMs of different peaks number 144 | def gmm_crit(coords, start, end, step, seed): 145 | X = coords.to_numpy(dtype=float) 146 | 147 | fig, ax = plt.subplots(figsize=(20, 10)) 148 | 149 | models = [] 150 | n_components = [] 151 | for n in range(start, end, step): 152 | print(f"Calculating model with {n} means") 153 | n_components.append(n) 154 | models.append(GaussianMixture(n_components=n, 155 | covariance_type='full', 156 | verbose=1, 157 | random_state=seed).fit(X)) 158 | 159 | plt.plot(n_components, [m.bic(X) for m in models], label='BIC') 160 | plt.plot(n_components, [m.aic(X) for m 
in models], label='AIC') 161 | plt.legend(loc='best') 162 | plt.xlabel('n_components') 163 | #plt.show() 164 | 165 | f = f"results/img/gmm.png" 166 | plt.savefig(f) 167 | 168 | 169 | # write GMM to jsonl file 170 | def save_gmm(gmm, filename): 171 | gmm_df = pd.DataFrame(columns=["weights", "means", "covariances", "precisions", "precisions_cholesky"]) 172 | gmm_df[["means", "covariances", "precisions", "precisions_cholesky"]] = gmm_df[["means", "covariances", "precisions", "precisions_cholesky"]].astype(object) 173 | gmm_df["weights"] = gmm.weights_ 174 | 175 | weights = gmm.weights_ 176 | print('2 dec - Estimated number of clusters: ' + str((np.round(weights, 2) > 0).sum())) 177 | print('3 dec - Estimated number of clusters: ' + str((np.round(weights, 3) > 0).sum())) 178 | print('4 dec - Estimated number of clusters: ' + str((np.round(weights, 4) > 0).sum())) 179 | print('5 dec - Estimated number of clusters: ' + str((np.round(weights, 5) > 0).sum())) 180 | 181 | for i in range(len(gmm.covariances_)): 182 | gmm_df.at[i, "means"] = np.array(gmm.means_[i]) 183 | gmm_df.at[i, "covariances"] = np.array(gmm.covariances_[i]) 184 | gmm_df.at[i, "precisions"] = np.array(gmm.precisions_[i]) 185 | gmm_df.at[i, "precisions_cholesky"] = np.array(gmm.precisions_cholesky_[i]) 186 | #print(gmm_df) 187 | 188 | with open(filename, "w") as f: 189 | gmm_df.to_json(f, orient='records', lines=True) 190 | print(f"PARAM\tSAVE\tParameters for GMM with {len(gmm_df.index)} means are written to file: {filename}") 191 | 192 | 193 | # read GMM from jsonl file 194 | def load_gmm(filename): 195 | data = pd.read_json(path_or_buf=filename, lines=True) 196 | #print(data) 197 | print(f"PARAM\tLOAD\tParameters for GMM with {len(data.index)} means are loaded") 198 | means, covs, prec, prec_chol = [], [], [], [] 199 | for i in range(len(data.index)): 200 | means.append(np.array(data.at[i, "means"])) 201 | covs.append(np.array(data.at[i, "covariances"])) 202 | prec.append(np.array(data.at[i, "precisions"])) 203 | prec_chol.append(np.array(data.at[i, "precisions_cholesky"])) 204 | 205 | print(f"MODEL\tINIT\tInitialization of GMM with {len(data.index)} means") 206 | gmm = GaussianMixture(n_components=len(data.index), covariance_type='full', max_iter=1, verbose=0, random_state=seed) 207 | gmm.weights_ = data["weights"].values 208 | 209 | weights = gmm.weights_ 210 | print('2 dec - Estimated number of clusters: ' + str((np.round(weights, 2) > 0).sum())) 211 | print('3 dec - Estimated number of clusters: ' + str((np.round(weights, 3) > 0).sum())) 212 | print('4 dec - Estimated number of clusters: ' + str((np.round(weights, 4) > 0).sum())) 213 | print('5 dec - Estimated number of clusters: ' + str((np.round(weights, 5) > 0).sum())) 214 | 215 | gmm.means_ = np.array(means) 216 | gmm.covariances_ = np.array(covs) 217 | gmm.precisions_ = np.array(prec) 218 | gmm.precisions_cholesky_ = np.array(prec_chol) 219 | 220 | print(f"MODEL\tSET\tParameters of GMM with {len(data.index)} means are set") 221 | return gmm 222 | 223 | 224 | # GMM clustering of point on the map 225 | def plot_gmm(gmm, coords): 226 | X = coords.to_numpy(dtype=float) 227 | xmin, xmax = -180., 180. 228 | ymin, ymax = -90., 90. 
229 | #print(X) 230 | 231 | fig, ax = plt.subplots(figsize=(20, 10)) 232 | ax.set_xlim(xmin, xmax) 233 | ax.set_ylim(ymin, ymax) 234 | map = Basemap() 235 | map.drawcoastlines(linewidth=0.5, color="black") 236 | map.drawcountries(linewidth=0.7, color="black") 237 | map.drawparallels(np.arange(ymin, ymax, 30.)) 238 | map.drawmeridians(np.arange(xmin, xmax, 30.)) 239 | map.drawmapboundary(fill_color='azure') 240 | map.fillcontinents(color='white', lake_color='azure') 241 | 242 | labels = gmm.predict(X) 243 | probs = gmm.predict_proba(X) 244 | size = probs.max(1)**2 245 | 246 | map.scatter(X[:, 0], X[:, 1], c=labels, s=size, cmap="turbo", zorder=5) 247 | 248 | #w_factor = 0.9 / gmm.weights_.max() 249 | for pos, covar, w in zip(gmm.means_, gmm.covariances_, gmm.weights_): 250 | U, s, Vt = np.linalg.svd(covar) 251 | angle = np.degrees(np.arctan2(U[1, 0], U[0, 0])) 252 | width, height = 2 * np.sqrt(s) 253 | for nsig in range(1, 4): 254 | ax.add_patch(Ellipse(pos, nsig * width, nsig * height, angle, 255 | color="black", alpha=0.05)) 256 | 257 | plt.title(f'Scatter plot of coordinates') 258 | plt.xlabel('Longitude') 259 | plt.ylabel('Latitude') 260 | plt.show() 261 | 262 | f = f"results/img/gmm_plot.png" 263 | plt.savefig(f) 264 | 265 | 266 | # fitting GMM to coords data 267 | def calc_gmm(coords, peaks, seed, iter, gmm_filename): 268 | X = coords.to_numpy(dtype=float) 269 | print(f"Calculating GMM with {peaks} means for max {iter} iterations") 270 | gmm = GaussianMixture(n_components=peaks, 271 | covariance_type='full', 272 | verbose=1, 273 | n_init=1, 274 | max_iter=iter, 275 | random_state=seed).fit(X) 276 | save_gmm(gmm, gmm_filename) 277 | 278 | 279 | # fitting Bayesian GMM to coords data 280 | def calc_bgmm(coords, peaks, seed, iter, bgmm_filename): 281 | X = coords.to_numpy(dtype=float) 282 | print(f"Calculating BGMM with {peaks} means for max {iter} iterations") 283 | bgmm = BayesianGaussianMixture(n_components=peaks, 284 | covariance_type='full', 285 | verbose=1, 286 | n_init=1, 287 | max_iter=iter, 288 | random_state=seed).fit(X) 289 | save_gmm(bgmm, bgmm_filename) 290 | 291 | 292 | # generate grid by number of steps 293 | def map_grid(peaks, step=200): 294 | xmin, xmax = -180, 180 295 | ymin, ymax = -90, 90 296 | x = np.linspace(xmin, xmax, step) 297 | y = np.linspace(ymin, ymax, step) 298 | x = np.concatenate((x, peaks[:, 0]), axis=0) 299 | x = np.sort(x) 300 | y = np.concatenate((y, peaks[:, 1]), axis=0) 301 | y = np.sort(y) 302 | xx, yy = np.meshgrid(x, y) 303 | #xx, yy = np.meshgrid(peaks[:, 0], peaks[:, 1]) 304 | return xx, yy 305 | 306 | 307 | # GMM likelihood score surface plot on the map (shifted to min as 0) 308 | def gmm_likelihood(gmm): 309 | xx, yy = map_grid(gmm.means_, 100) 310 | XX = np.array([xx.ravel(), yy.ravel()]).T 311 | 312 | print(f"Calculating scores from GMM for {len(XX)} points") 313 | Z = gmm.score_samples(XX) 314 | zmin = np.min(Z) 315 | zmax = np.max(Z) 316 | print(f"Original\tMin: {zmin}\tMax: {zmax}") 317 | Z = Z - np.min(Z) 318 | zmin = np.min(Z) 319 | zmax = np.max(Z) 320 | print(f"Adjusted\tMin: {zmin}\tMax: {zmax}") 321 | #Z = np.exp(Z) 322 | Z = Z.reshape(xx.shape) 323 | #print(Z) 324 | 325 | # P = gmm.predict_proba(XX) 326 | # S = P.max(1) 327 | # S = S.reshape(xx.shape) 328 | # print(S) 329 | # scaler = MinMaxScaler((0, 100)) 330 | # Z = scaler.fit_transform(Z) 331 | # print(Z) 332 | 333 | fig = plt.figure(figsize=(20, 15)) 334 | ax = plt.axes(projection='3d') 335 | 336 | ncolors = 256 337 | color_array = plt.get_cmap('rainbow')(range(ncolors)) 338 | 
color_array[:,-1] = np.linspace(0.2,1.0,ncolors) 339 | map_object = LinearSegmentedColormap.from_list(name='rainbow_alpha',colors=color_array) 340 | plt.register_cmap(cmap=map_object) 341 | 342 | surf = ax.plot_surface(xx, yy, Z, rstride=1, cstride=1, cmap='rainbow_alpha', edgecolor='none') 343 | contour = ax.contour(xx, yy, Z, levels=np.linspace(zmin, zmax, 500), zdir='z', offset=0, cmap='rainbow_alpha') 344 | 345 | map = Basemap(fix_aspect=False) 346 | ax.add_collection3d(map.drawcoastlines(linewidth=0.25)) 347 | ax.add_collection3d(map.drawcountries(linewidth=0.35)) 348 | 349 | ax.set_yticks(range(-90, 90, 30)) 350 | ax.set_xticks(range(-180, 180, 30)) 351 | ax.set_zticks(range(int(zmin), int(zmax), 5)) 352 | 353 | ax.set_box_aspect((4, 3, 1)) 354 | ax.set_xlabel('longitude') 355 | ax.set_ylabel('latitude') 356 | ax.set_zlabel('Log-likelihood') 357 | ax.set_title(f'Surface plot of likelihood from GMM with {len(gmm.weights_)} peaks for {len(XX)} points estimated from worldwide tweets 2022') 358 | fig.colorbar(surf, shrink=0.5, aspect=10, location='left') # add color bar indicating the likelihood 359 | ax.view_init(30, -60) 360 | 361 | pic = f"results/img/gmm_likelihood_world.png" 362 | 363 | #fig.tight_layout() 364 | plt.savefig(pic, dpi=600) 365 | print(f"PLOT\tSAVE\tPlot of GMM likelihood for {len(XX)} points is drawn to file: {pic}") 366 | plt.show() 367 | 368 | 369 | # GMM PDF surface plot on the map 370 | def gmm_density(gmm): 371 | xmin, xmax = -180, 180 372 | ymin, ymax = -90, 90 373 | xx, yy = np.mgrid[xmin:xmax:200j, ymin:ymax:200j] 374 | XX = np.array([xx.ravel(), yy.ravel()]).T 375 | 376 | print(f"Calculating scores from GMM for {len(XX)} points") 377 | Z = gmm.score_samples(XX) 378 | zmin = np.min(Z) 379 | zmax = np.max(Z) 380 | print(f"Original\tMin: {zmin}\tMax: {zmax}") 381 | Z = np.exp(Z) 382 | zmin = np.min(Z) 383 | zmax = np.max(Z) 384 | print(f"Probability\tMin: {zmin}\tMax: {zmax}") 385 | Z = Z.reshape(xx.shape) 386 | #print(Z) 387 | 388 | fig = plt.figure(figsize=(20, 15)) 389 | ax = plt.axes(projection='3d') 390 | 391 | ncolors = 256 392 | color_array = plt.get_cmap('rainbow')(range(ncolors)) 393 | color_array[:,-1] = np.linspace(0.2,1.0,ncolors) 394 | map_object = LinearSegmentedColormap.from_list(name='rainbow_alpha',colors=color_array) 395 | plt.register_cmap(cmap=map_object) 396 | 397 | surf = ax.plot_surface(xx, yy, Z, rstride=1, cstride=1, cmap='rainbow_alpha', edgecolor='none') 398 | contour = ax.contour(xx, yy, Z, levels=np.linspace(zmin, zmax, 500), zdir='z', offset=0, cmap='rainbow_alpha') 399 | 400 | map = Basemap(fix_aspect=False) 401 | ax.add_collection3d(map.drawcoastlines(linewidth=0.25)) 402 | ax.add_collection3d(map.drawcountries(linewidth=0.35)) 403 | 404 | ax.set_yticks(range(-90, 90, 30)) 405 | ax.set_xticks(range(-180, 180, 30)) 406 | #ax.set_zticks(range(int(zmin), int(zmax))) 407 | 408 | ax.set_box_aspect((4, 3, 1)) 409 | ax.set_xlabel('longitude') 410 | ax.set_ylabel('latitude') 411 | ax.set_zlabel('Probability') 412 | ax.set_title(f'Surface plot of probability from GMM with {len(gmm.weights_)} peaks for {len(XX)} points estimated from worldwide tweets 2022') 413 | fig.colorbar(surf, shrink=0.5, aspect=10, location='left') # add color bar indicating the probability 414 | ax.view_init(10, -60) 415 | 416 | pic = f"results/img/gmm_probability_world.png" 417 | 418 | #fig.tight_layout() 419 | plt.savefig(pic, dpi=600) 420 | print(f"PLOT\tSAVE\tPlot of GMM probability for {len(XX)} points is drawn to file: {pic}") 421 | plt.show() 422 | 423 | 424 | 
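# Illustrative sketch (hypothetical helper, not called anywhere in this script): gmm_likelihood()
# and gmm_density() differ only in how gmm.score_samples() is post-processed. score_samples()
# returns log p(x); the likelihood plot shifts it so its minimum is 0, while the density plot
# exponentiates it back to a PDF. On toy data, np.exp(score_samples()) integrates to roughly 1
# over a grid that covers the data, which is why the z-axis of gmm_density() can be read as a
# probability density rather than a shifted log-likelihood.
def _density_sanity_check(seed=42):
    rng = np.random.default_rng(seed)
    toy = rng.normal(loc=[10.0, 50.0], scale=[5.0, 3.0], size=(1000, 2))  # toy lon, lat points
    toy_gmm = GaussianMixture(n_components=3, covariance_type='full', random_state=seed).fit(toy)

    lon = np.linspace(-180, 180, 361)
    lat = np.linspace(-90, 90, 181)
    xx, yy = np.meshgrid(lon, lat)
    grid = np.column_stack([xx.ravel(), yy.ravel()])

    pdf = np.exp(toy_gmm.score_samples(grid))     # log-likelihood -> probability density
    cell = (lon[1] - lon[0]) * (lat[1] - lat[0])  # grid cell area in square degrees
    print(f"Integral of exp(score_samples) over the grid: {pdf.sum() * cell:.3f}")  # ~1.0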
# GMM PDF contour plot on the map 425 | def gmm_contour(gmm): 426 | xmin, xmax = -180, 180 427 | ymin, ymax = -90, 90 428 | xx, yy = map_grid(gmm.means_, 200) 429 | XX = np.array([xx.ravel(), yy.ravel()]).T 430 | 431 | print(f"Calculating scores from GMM for {len(XX)} points") 432 | Z = gmm.score_samples(XX) 433 | zmin = np.min(Z) 434 | zmax = np.max(Z) 435 | print(f"Original\tMin: {zmin}\tMax: {zmax}") 436 | Z = Z.reshape(xx.shape) 437 | 438 | fig, ax = plt.subplots(figsize=(20, 10)) 439 | ax.set_xlim(xmin, xmax) 440 | ax.set_ylim(ymin, ymax) 441 | 442 | contour = ax.contour(xx, yy, Z, levels=np.linspace(zmin, zmax, 300), cmap='RdYlGn_r', linewidths=0.5) 443 | 444 | map = Basemap() 445 | map.drawcoastlines(linewidth=0.5, color="black") 446 | map.drawcountries(linewidth=0.7, color="black") 447 | map.drawparallels(np.arange(ymin, ymax, 30.)) 448 | map.drawmeridians(np.arange(xmin, xmax, 30.)) 449 | map.drawmapboundary(fill_color='azure') 450 | map.fillcontinents(color='white', lake_color='azure') 451 | 452 | peaks = gmm.means_ 453 | map.scatter(peaks[:, 0], peaks[:, 1], c=gmm.weights_, cmap="RdYlGn_r", s=0.7, zorder=5) 454 | 455 | plt.title(f'Contour plot of likelihood from GMM with {len(gmm.weights_)} peaks for {len(XX)} points estimated from worldwide tweets 2022') 456 | fig.colorbar(contour, shrink=0.5, aspect=10, location='left') # add color bar indicating the likelihood 457 | plt.xlabel('Longitude') 458 | plt.ylabel('Latitude') 459 | 460 | pic = f"results/img/gmm_contour_plot.png" 461 | plt.savefig(pic, dpi=600) 462 | print(f"PLOT\tSAVE\tPlot of GMM probability for {len(XX)} points is drawn to file: {pic}") 463 | 464 | plt.show() 465 | 466 | 467 | filename = "twitter-2020-02-28.txt" 468 | world = "map-world/world-twitter-2022-coords.jsonl" 469 | kde_results = "kde_world_2022.jsonl" 470 | 471 | coords = load_jsonl(filename) 472 | print(coords) 473 | 474 | peaks = 1000 475 | seed = 42 476 | iter = 1000 477 | 478 | gmm_filename = f"datasets/gmm-p{peaks}-c{len(coords.index)}.jsonl" 479 | bgmm_filename = f"datasets/bgmm-p{peaks}-c{len(coords.index)}.jsonl" 480 | gmm_200 = "datasets/200-gmm.jsonl" 481 | bgmm_cluser = "datasets/bgmm-p200-c12057022.jsonl" 482 | 483 | #calc_gmm(coords, peaks, seed, iter, gmm_filename) 484 | #calc_bgmm(coords, peaks, seed, iter, bgmm_filename) 485 | 486 | # gmm = load_gmm(bgmm_cluser) 487 | X = coords[["longitude", "latitude"]].to_numpy(dtype=float) 488 | # Z = gmm.score_samples(X) 489 | # print(Z.min(), Z.max(), Z.mean()) 490 | # gmm = load_gmm(gmm_200) 491 | # Z = gmm.score_samples(X) 492 | # print(Z.min(), Z.max(), Z.mean()) 493 | # gmm_likelihood(gmm) 494 | # gmm_density(gmm) 495 | # gmm_contour(gmm) 496 | 497 | 498 | 499 | # test for differences in covariance 500 | # 501 | # from scipy.linalg import cholesky 502 | # 503 | # cov = "spherical" 504 | # gmm = GaussianMixture(5, covariance_type=cov, max_iter=1).fit(X) 505 | # print(cov) 506 | # print(gmm.covariances_) 507 | # print(gmm.precisions_) 508 | # print(gmm.precisions_cholesky_) 509 | # print(cholesky(gmm.covariances_)) 510 | # 511 | # cov = "diag" 512 | # gmm = GaussianMixture(5, covariance_type=cov, max_iter=1).fit(X) 513 | # print(cov) 514 | # print(gmm.covariances_) 515 | # print(gmm.precisions_) 516 | # print(gmm.precisions_cholesky_) 517 | # 518 | # cov = "full" 519 | # gmm = GaussianMixture(5, covariance_type=cov, max_iter=1).fit(X) 520 | # print(cov) 521 | # print(gmm.covariances_) 522 | # print(gmm.precisions_) 523 | # print(gmm.precisions_cholesky_) 524 | # 525 | # cov = "tied" 526 | # gmm = 
GaussianMixture(5, covariance_type=cov, max_iter=1).fit(X) 527 | # print(cov) 528 | # print(gmm.covariances_) 529 | # print(gmm.precisions_) 530 | # print(gmm.precisions_cholesky_) 531 | -------------------------------------------------------------------------------- /supplementary_resources/scripts/python/data-from-test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import re 3 | import string 4 | from sklearn.model_selection import train_test_split 5 | # df = pd.read_csv("datasets/full_text.txt", 6 | # delimiter="\t", 7 | # header=None, 8 | # encoding='latin-1') 9 | # df.columns = ["user", "time", "x", "lat", "lon", "text"] 10 | # print(len(df['user'].unique())) 11 | # print(df.info()) 12 | # tdf = df.sample(n=300000, random_state=42) 13 | # print(len(tdf['user'].unique())) 14 | # df = df.drop(df.sample(n=300000, random_state=42).index, axis=0) 15 | # edf = df.sample(n=76000, random_state=42) 16 | # print(df.info()) 17 | # print(len(edf['user'].unique())) 18 | df = pd.read_json(path_or_buf="datasets/worldwide-twitter-day.jsonl", lines=True) 19 | print(len(df['user'].unique())) 20 | print(df.user.mode()) 21 | n = 100 22 | users = df['user'].value_counts()[:n].index.tolist() 23 | 24 | for u in users: 25 | print((df.user == u).sum(), u) 26 | 27 | user_df = df[df['user'].isin(users)] 28 | with open("datasets/worldwide-twitter-day-users.jsonl", "w") as f: 29 | user_df.to_json(f, orient='records', lines=True) 30 | 31 | 32 | # users = [y for x, y in df.groupby('user')] 33 | # 34 | # train_users, test_users = train_test_split(users, test_size=0.2, random_state=42) 35 | # train_users, test_users = list(train_users), list(test_users) 36 | # train_size, test_size = len(train_users), len(test_users) 37 | # 38 | # train_df = pd.concat(train_users) 39 | # test_df = pd.concat(test_users) 40 | # print(train_df.info()) 41 | # print(len(train_df['user'].unique())) 42 | # print(test_df.info()) 43 | # print(len(test_df['user'].unique())) 44 | # 45 | 46 | # with open("datasets/eisenstein_test.jsonl", "w") as f: 47 | # test_df.to_json(f, orient='records', lines=True) 48 | 49 | 50 | # print(len(df["user"].unique())) 51 | # df_by_user = pd.DataFrame(columns=["lon", "lat", "text", "user"]) 52 | # df_by_user["user"] = df["user"].unique() 53 | # print(df_by_user.info()) 54 | # print(df_by_user) 55 | # for i in range(len(df_by_user.index)): 56 | # user = df_by_user.loc[df_by_user.index[i], "user"] 57 | # user_texts = df.loc[df['user'] == user] 58 | # df_by_user.loc[df_by_user.index[i], "text"] = '. 
'.join(user_texts["text"].astype(str)) 59 | # df_by_user.loc[df_by_user.index[i], "lon"] = user_texts.loc[user_texts.index[0], "lon"] 60 | # df_by_user.loc[df_by_user.index[i], "lat"] = user_texts.loc[user_texts.index[0], "lat"] 61 | # 62 | # print(df) 63 | # 64 | 65 | 66 | # def chunks(lst, n): 67 | # """Yield successive n-sized chunks from lst.""" 68 | # for i in range(0, len(lst), n): 69 | # yield lst[i:i + n] 70 | # 71 | # df = pd.read_json(path_or_buf="datasets/eisenstein_user.jsonl", lines=True) 72 | # print(df.info()) 73 | # print(df["text"]) 74 | # lengths = df["text"].str.len() 75 | # print(lengths) 76 | # count = df['text'].str.split().str.len() 77 | # print(count) 78 | # 79 | # df_split = pd.DataFrame(columns=["lon", "lat", "text", "user"]) 80 | # for i in range(len(df.index)): 81 | # texts = df.loc[df.index[i], "text"] 82 | # words = len(texts.split()) 83 | # tw = texts.split() 84 | # if words // 300 > 0: 85 | # n = words//300 + 1 86 | # size = words//n + 1 87 | # splitted = [tw[i:i + size] for i in range(0, len(tw), size)] 88 | # for k in range(len(splitted)): 89 | # df_short = df.iloc[i] 90 | # splitted[k] = " ".join(splitted[k]) 91 | # df_short["text"] = splitted[k] 92 | # df_split = df_split.append(df_short, ignore_index=True) 93 | # else: 94 | # df_split = df_split.append(df.iloc[i], ignore_index=True) 95 | # 96 | # 97 | # with open("datasets/eisenstein_user_test.jsonl", "w") as f: 98 | # df_split.to_json(f, orient='records', lines=True) 99 | 100 | # def nlp_filtering(text): 101 | # def filter_punctuation(text): 102 | # punctuationfree="".join([i for i in text if i not in string.punctuation]) 103 | # return punctuationfree 104 | # 105 | # def filter_websites(text): 106 | # #pattern = r'(http\:\/\/|https\:\/\/)?([a-z0-9][a-z0-9\-]*\.)+[a-z][a-z\-]*' 107 | # pattern = r'http\S+' 108 | # text = re.sub(pattern, '', text) 109 | # return text 110 | # 111 | # text = filter_websites(text) 112 | # text = filter_punctuation(text) 113 | # return text 114 | # 115 | # df = pd.read_json(path_or_buf="datasets/eisenstein_user_test.jsonl", lines=True) 116 | # print(df.info()) 117 | # print(df["text"]) 118 | # lengths = df["text"].str.len() 119 | # print(lengths) 120 | # count = df['text'].str.split().str.len() 121 | # print(count.max()) 122 | # 123 | # df["text"] = df["text"].apply(nlp_filtering) 124 | # 125 | # count = df['text'].str.split().str.len() 126 | # print(count.max()) 127 | 128 | 129 | # df = pd.read_json(path_or_buf="results/val-data/U-NON-GEO+GEO-ONLY-O5-d-total_sum-mf_sum-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N100000_2022-10-29.jsonl", lines=True) 130 | # df = df.drop(df.sample(n=99900, random_state=42).index, axis=0) 131 | # with open("results/val-data/pmop-test.jsonl", "w") as f: 132 | # df.to_json(f, orient='records', lines=True) 133 | 134 | # size = 300000 135 | # vs = 300000 136 | # df = pd.read_json(path_or_buf="datasets/worldwide-twitter-day.jsonl", lines=True) 137 | # print(df.info()) 138 | # print(len(df['lang'].unique())) 139 | # print(len(df['code'].unique())) 140 | # print(len(df['user'].unique())) 141 | # tdf = df.sample(n=size, random_state=42) 142 | # print(len(tdf['lang'].unique())) 143 | # print(len(tdf['code'].unique())) 144 | # print(len(tdf['user'].unique())) 145 | # df = df.drop(df.sample(n=size, random_state=42).index, axis=0) 146 | # edf = df.sample(n=vs, random_state=42) 147 | # print(df.info()) 148 | # print(len(edf['lang'].unique())) 149 | # print(len(edf['code'].unique())) 150 | # print(len(edf['user'].unique())) 151 | 
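# Alternative sketch (not used by this script): the commented-out per-user loop earlier in this
# file joins each user's tweets into one text and keeps the first lon/lat. Assuming the same
# "user", "text", "lon", "lat" columns, the aggregation can be expressed as a single groupby,
# which avoids the row-by-row .loc writes:
#
# def aggregate_tweets_by_user(df):
#     return (df.groupby("user", as_index=False)
#               .agg(text=("text", lambda t: ". ".join(t.astype(str))),
#                    lon=("lon", "first"),
#                    lat=("lat", "first")))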
-------------------------------------------------------------------------------- /supplementary_resources/scripts/python/geotext-dataframe.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import re 3 | import string 4 | from sklearn.model_selection import train_test_split 5 | 6 | 7 | # GeoText data from Eisenstein work - forming train and test dataframes 8 | 9 | # df = pd.read_csv("datasets/full_text.txt", 10 | # delimiter="\t", 11 | # header=None, 12 | # encoding='latin-1') 13 | # df.columns = ["user", "time", "x", "lat", "lon", "text"] 14 | # print(len(df['user'].unique())) 15 | # print(df.info()) 16 | # tdf = df.sample(n=300000, random_state=42) 17 | # print(len(tdf['user'].unique())) 18 | # df = df.drop(df.sample(n=300000, random_state=42).index, axis=0) 19 | # edf = df.sample(n=76000, random_state=42) 20 | # print(df.info()) 21 | # print(len(edf['user'].unique())) 22 | 23 | df = pd.read_json(path_or_buf="datasets/eisenstein.jsonl", lines=True) 24 | 25 | users = [y for x, y in df.groupby('user')] 26 | 27 | train_users, test_users = train_test_split(users, test_size=0.2, random_state=42) 28 | train_users, test_users = list(train_users), list(test_users) 29 | train_size, test_size = len(train_users), len(test_users) 30 | 31 | train_df = pd.concat(train_users) 32 | test_df = pd.concat(test_users) 33 | print(train_df.info()) 34 | print(len(train_df['user'].unique())) 35 | print(test_df.info()) 36 | print(len(test_df['user'].unique())) 37 | 38 | with open("datasets/eisenstein_train.jsonl", "w") as f: 39 | train_df.to_json(f, orient='records', lines=True) 40 | with open("datasets/eisenstein_test.jsonl", "w") as f: 41 | test_df.to_json(f, orient='records', lines=True) 42 | 43 | 44 | # print(len(df["user"].unique())) 45 | # df_by_user = pd.DataFrame(columns=["lon", "lat", "text", "user"]) 46 | # df_by_user["user"] = df["user"].unique() 47 | # print(df_by_user.info()) 48 | # print(df_by_user) 49 | # for i in range(len(df_by_user.index)): 50 | # user = df_by_user.loc[df_by_user.index[i], "user"] 51 | # user_texts = df.loc[df['user'] == user] 52 | # df_by_user.loc[df_by_user.index[i], "text"] = '. 
'.join(user_texts["text"].astype(str)) 53 | # df_by_user.loc[df_by_user.index[i], "lon"] = user_texts.loc[user_texts.index[0], "lon"] 54 | # df_by_user.loc[df_by_user.index[i], "lat"] = user_texts.loc[user_texts.index[0], "lat"] 55 | # 56 | # print(df) 57 | # 58 | 59 | 60 | # def chunks(lst, n): 61 | # """Yield successive n-sized chunks from lst.""" 62 | # for i in range(0, len(lst), n): 63 | # yield lst[i:i + n] 64 | # 65 | # df = pd.read_json(path_or_buf="datasets/eisenstein_user.jsonl", lines=True) 66 | # print(df.info()) 67 | # print(df["text"]) 68 | # lengths = df["text"].str.len() 69 | # print(lengths) 70 | # count = df['text'].str.split().str.len() 71 | # print(count) 72 | # 73 | # df_split = pd.DataFrame(columns=["lon", "lat", "text", "user"]) 74 | # for i in range(len(df.index)): 75 | # texts = df.loc[df.index[i], "text"] 76 | # words = len(texts.split()) 77 | # tw = texts.split() 78 | # if words // 300 > 0: 79 | # n = words//300 + 1 80 | # size = words//n + 1 81 | # splitted = [tw[i:i + size] for i in range(0, len(tw), size)] 82 | # for k in range(len(splitted)): 83 | # df_short = df.iloc[i] 84 | # splitted[k] = " ".join(splitted[k]) 85 | # df_short["text"] = splitted[k] 86 | # df_split = df_split.append(df_short, ignore_index=True) 87 | # else: 88 | # df_split = df_split.append(df.iloc[i], ignore_index=True) 89 | # 90 | # 91 | # with open("datasets/eisenstein_user_test.jsonl", "w") as f: 92 | # df_split.to_json(f, orient='records', lines=True) 93 | 94 | # def nlp_filtering(text): 95 | # def filter_punctuation(text): 96 | # punctuationfree="".join([i for i in text if i not in string.punctuation]) 97 | # return punctuationfree 98 | # 99 | # def filter_websites(text): 100 | # #pattern = r'(http\:\/\/|https\:\/\/)?([a-z0-9][a-z0-9\-]*\.)+[a-z][a-z\-]*' 101 | # pattern = r'http\S+' 102 | # text = re.sub(pattern, '', text) 103 | # return text 104 | # 105 | # text = filter_websites(text) 106 | # text = filter_punctuation(text) 107 | # return text 108 | # 109 | # df = pd.read_json(path_or_buf="datasets/eisenstein_user_test.jsonl", lines=True) 110 | # print(df.info()) 111 | # print(df["text"]) 112 | # lengths = df["text"].str.len() 113 | # print(lengths) 114 | # count = df['text'].str.split().str.len() 115 | # print(count.max()) 116 | # 117 | # df["text"] = df["text"].apply(nlp_filtering) 118 | # 119 | # count = df['text'].str.split().str.len() 120 | # print(count.max()) 121 | 122 | 123 | # df = pd.read_json(path_or_buf="results/val-data/U-NON-GEO+GEO-ONLY-O5-d-total_sum-mf_sum-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N100000_2022-10-29.jsonl", lines=True) 124 | # df = df.drop(df.sample(n=99900, random_state=42).index, axis=0) 125 | # with open("results/val-data/pmop-test.jsonl", "w") as f: 126 | # df.to_json(f, orient='records', lines=True) 127 | 128 | # size = 300000 129 | # vs = 300000 130 | # df = pd.read_json(path_or_buf="datasets/worldwide-twitter-day.jsonl", lines=True) 131 | # print(df.info()) 132 | # print(len(df['lang'].unique())) 133 | # print(len(df['code'].unique())) 134 | # print(len(df['user'].unique())) 135 | # tdf = df.sample(n=size, random_state=42) 136 | # print(len(tdf['lang'].unique())) 137 | # print(len(tdf['code'].unique())) 138 | # print(len(tdf['user'].unique())) 139 | # df = df.drop(df.sample(n=size, random_state=42).index, axis=0) 140 | # edf = df.sample(n=vs, random_state=42) 141 | # print(df.info()) 142 | # print(len(edf['lang'].unique())) 143 | # print(len(edf['code'].unique())) 144 | # print(len(edf['user'].unique())) 145 | 
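# Alternative sketch (not used by this script): the user-level split above builds a list with one
# DataFrame per user and splits that list. Assuming the same "user" column from eisenstein.jsonl,
# sklearn's GroupShuffleSplit gives the same guarantee (all tweets of a user land on the same side
# of the split) without materialising the per-user frames:
#
# from sklearn.model_selection import GroupShuffleSplit
#
# def split_by_user(df, test_size=0.2, seed=42):
#     gss = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=seed)
#     train_idx, test_idx = next(gss.split(df, groups=df["user"]))
#     return df.iloc[train_idx], df.iloc[test_idx]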
-------------------------------------------------------------------------------- /supplementary_resources/scripts/python/hf_repo.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from utils.regressor import * 3 | from utils.result_manager import * 4 | from transformers import Pipeline 5 | from transformers import BertConfig, BertTokenizer 6 | 7 | base_model = "bert-base-multilingual-cased" 8 | 9 | local_model = "P-NON-GEO+GEO-ONLY-O5.pth" 10 | 11 | hub_model = 'k4tel/geo-bert-multilingual' 12 | hf_folder = "models/hf/model" 13 | 14 | local_model = f'models/final/{local_model}' 15 | 16 | 17 | # upload local model and save for the future HF repo upload 18 | def save(local_model, hf_folder, base_model): 19 | config = BertConfig.from_pretrained(base_model) 20 | 21 | feature_outputs = BERTregModel(5, "spher", True, ["NON-GEO", "GEO-ONLY"], base_model).feature_outputs 22 | 23 | custom_model = GeoBertModel(config=config, feature_outputs=feature_outputs) 24 | 25 | if torch.cuda.is_available(): 26 | state = torch.load(local_model) 27 | else: 28 | state = torch.load(local_model, map_location='cpu') 29 | 30 | model_state_dict = state['model_state_dict'] 31 | 32 | custom_model.load_state_dict(model_state_dict) 33 | 34 | tokenizer = BertTokenizer.from_pretrained(base_model) 35 | 36 | custom_model.save_pretrained(hf_folder) 37 | tokenizer.save_pretrained(hf_folder) 38 | 39 | # add all files from hf_folder to the HF repo manually 40 | 41 | 42 | # huggingface framework load from repo + prediction pipeline test 43 | def load(hub_model, base_model): 44 | model_wrapper = BERTregModel(5, "spher", True, ["NON-GEO", "GEO-ONLY"], base_model, hub_model) 45 | benchmark = ModelBenchmark(model_wrapper, True, "pos", "mean", "mean") 46 | 47 | tokenizer = BertTokenizer.from_pretrained(hub_model) 48 | model = model_wrapper.model 49 | 50 | # testing model 51 | text = "CIA and FBI can track anyone, and you willingly give the data away" 52 | inputs = tokenizer(text, return_tensors="pt") 53 | 54 | with torch.no_grad(): 55 | outputs = model(**inputs) 56 | prob_model = benchmark.prob_models(outputs) 57 | 58 | print(f"RESULT\tPost-processing raw model outputs: {outputs}") 59 | result = ResultManager(None, text, "NON-GEO", torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"), benchmark, False, False, hub_model) 60 | result.soft_outputs(list([prob_model])) 61 | 62 | ind = np.argwhere(np.round(result.weights[0, :] * 100, 2) > 0) 63 | significant = result.means[0, ind].reshape(-1, 2) 64 | weights = result.weights[0, ind].flatten() 65 | 66 | sig_weights = np.round(weights * 100, 2) 67 | sig_weights = sig_weights[sig_weights > 0] 68 | 69 | print(f"RESULT\t{len(sig_weights)} significant prediction outcome(s):") 70 | 71 | for i in range(len(sig_weights)): 72 | point = f"lon: {' lat: '.join(map(str, significant[i]))}" 73 | print(f"\tOut {i + 1}\t{sig_weights[i]}%\t-\t{point}") 74 | 75 | 76 | # save(local_model, hf_folder, base_model) 77 | # load(hub_model, base_model) 78 | -------------------------------------------------------------------------------- /supplementary_resources/scripts/python/json_split.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | # specify the input and output file names 5 | folder = "datasets/" 6 | input_file = 'worldwide-twitter-2021.jsonl' 7 | output_file_prefix = f'{folder}worldwide-twitter-2021_' 8 | max_lines_per_file = 20000000 9 | 10 | # read the input file 
and split the data into multiple files 11 | with open(f"{folder}{input_file}", 'r') as f: 12 | line_count = 0 13 | file_count = 0 14 | out_f = None 15 | for line in f: 16 | if line_count % max_lines_per_file == 0: 17 | # create a new output file 18 | if out_f: 19 | out_f.close() 20 | output_file_name = f'{output_file_prefix}{file_count}.jsonl' 21 | out_f = open(output_file_name, 'w') 22 | file_count += 1 23 | json_obj = json.loads(line.strip()) 24 | out_f.write(f'{line.strip()}\n') 25 | line_count += 1 26 | if out_f: 27 | out_f.close() 28 | 29 | -------------------------------------------------------------------------------- /supplementary_resources/scripts/python/loss_graph_prob.py: -------------------------------------------------------------------------------- 1 | import tkinter 2 | import matplotlib 3 | matplotlib.use('TkAgg') 4 | 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | from mpl_toolkits.mplot3d import Axes3D 8 | from mpl_toolkits import mplot3d 9 | 10 | def loss(D, sigma): 11 | numerator = np.exp(-D**2 / (2 * sigma)) 12 | denominator = 2 * np.pi * sigma 13 | return -np.log(numerator / denominator) 14 | 15 | 16 | num = 100 17 | D_vals = np.linspace(0, 1, num) 18 | sigma_vals = np.linspace(1 / (2 * np.pi), 1, num) 19 | D_grid, sigma_grid = np.meshgrid(D_vals, sigma_vals) 20 | loss_vals = loss(D_grid, sigma_grid) 21 | 22 | fig = plt.figure() 23 | ax = fig.add_subplot(111, projection='3d') 24 | 25 | surf = ax.plot_surface(D_grid, sigma_grid, loss_vals, cmap='coolwarm', alpha = 0.7) 26 | cntr = ax.contour3D(D_grid, sigma_grid, loss_vals, 100, cmap='coolwarm') 27 | 28 | sigma_const = 1 / (2 * np.pi) 29 | sigma_vals_off = np.linspace(0.001, sigma_const, num) 30 | 31 | D_grid_off, sigma_grid_off = np.meshgrid(D_vals, sigma_vals_off) 32 | loss_vals_off = loss(D_grid_off, sigma_grid_off) 33 | loss_vals_off_clipped = np.clip(loss_vals_off, -1, np.max(loss_vals)) 34 | 35 | cntr_off = ax.contour3D(D_grid_off, sigma_grid_off, loss_vals_off_clipped, 100, cmap='coolwarm', alpha=0.5) 36 | 37 | 38 | loss_wall = np.linspace(-1, np.max(loss_vals), num) 39 | D, L = np.meshgrid(D_grid, loss_wall) 40 | 41 | bound = ax.plot_surface(D, np.ones_like(D) * sigma_const, L, facecolor="black", alpha=0.2) 42 | 43 | sigma_vals_full = np.linspace(0.001, 1, num) 44 | D_grid_full, sigma_grid_full = np.meshgrid(D_vals, sigma_vals_full) 45 | loss_zero = ax.plot_surface(D_grid_full, sigma_grid_full, loss_vals*0, facecolor="black", alpha = 0.2) 46 | 47 | 48 | ax.set_xlabel(r'$D^2$') 49 | ax.set_ylabel(r'$\sigma$') 50 | ax.set_zlabel('Loss') 51 | 52 | ax.set_title("Negative Log-Likelihood Loss") 53 | 54 | ax.text(-0.2, sigma_const, -1, r'$\frac{1}{2\pi}$', color='red', fontsize=14, ha='center', va='center') 55 | ax.text(-0.2, -0.2, 0, 'min', color='red', fontsize=12, ha='center', va='center') 56 | 57 | 58 | ax.set_zlim(-1, np.max(loss_vals)) 59 | ax.set_xlim(0, 1) 60 | ax.set_ylim(0, 1) 61 | 62 | fig.colorbar(surf, shrink=0.9, pad=0.1, location="left") 63 | 64 | ax.view_init(30, 164) 65 | 66 | plt.savefig("loss_graph.png", dpi=600) 67 | 68 | plt.show() 69 | 70 | 71 | -------------------------------------------------------------------------------- /supplementary_resources/scripts/python/loss_graph_spat.py: -------------------------------------------------------------------------------- 1 | import tkinter 2 | import matplotlib 3 | matplotlib.use('TkAgg') 4 | 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | from mpl_toolkits.mplot3d import Axes3D 8 | from mpl_toolkits import mplot3d 9 | 10 | 
def loss(x, y): 11 | return np.power(x, 2) + np.power(y, 2) 12 | 13 | num = 100 14 | 15 | X_vals = np.linspace(0, 140, num) 16 | Y_vals = np.linspace(0, 120, num) 17 | 18 | X_grid, Y_grid = np.meshgrid(X_vals, Y_vals) 19 | 20 | loss_vals = loss(X_grid, Y_grid) 21 | 22 | fig = plt.figure() 23 | ax = fig.add_subplot(111, projection='3d') 24 | 25 | surf = ax.plot_surface(X_grid, Y_grid, loss_vals, cmap='coolwarm', alpha = 0.7) 26 | cntr = ax.contour3D(X_grid, Y_grid, loss_vals, 100, cmap='coolwarm') 27 | 28 | loss_zero = ax.plot_surface(X_grid, Y_grid, loss_vals*0, facecolor="black", alpha = 0.2) 29 | loss_zero = ax.plot_surface(X_grid, Y_grid, loss_vals*0 + 15000, facecolor="black", alpha = 0.2) 30 | 31 | 32 | ax.set_xlabel(r'$\Delta Y_{lon}$') 33 | ax.set_ylabel(r'$\Delta Y_{lat}$') 34 | ax.set_zlabel('Loss') 35 | 36 | ax.set_title("Squared Euclidean Distance") 37 | 38 | ax.set_zlim(-1, np.max(loss_vals)) 39 | ax.set_xlim(0, 140) 40 | ax.set_ylim(0, 120) 41 | 42 | fig.colorbar(surf, shrink=0.9, pad=0.1, location="left") 43 | 44 | ax.view_init(30, 164) 45 | 46 | plt.savefig("loss_graph_spat.png", dpi=600) 47 | 48 | plt.show() 49 | 50 | 51 | -------------------------------------------------------------------------------- /supplementary_resources/scripts/python/ner-gazetteer-test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import sys 3 | from shapely.geometry import Point, LineString 4 | import geopandas as gpd 5 | from geopandas import GeoDataFrame 6 | import spacy 7 | import os.path 8 | from os import path 9 | from geopy.geocoders import Nominatim 10 | from geopy.exc import GeocoderTimedOut 11 | import geopy.distance 12 | import matplotlib.pyplot as plt 13 | 14 | train_set = "geo.jsonl" 15 | temp = "nlp.jsonl" 16 | test = "test.jsonl" 17 | result = "result-small.jsonl" 18 | 19 | empty = "empty-twitter-example-big.jsonl" 20 | 21 | nlp_model = "en_core_web_sm" 22 | world_model = "naturalearth_lowres" 23 | 24 | # test run for NER + gazetteer approach 25 | 26 | 27 | def read_json(file): 28 | print("=== Reading data from:", file) 29 | try: 30 | df = pd.read_json(path_or_buf=file, lines=True) 31 | #print(df.head(5)) 32 | print(df) 33 | except: 34 | print("=== Can't read data from:", file) 35 | 36 | return df 37 | 38 | 39 | def coords_on_map(df, long="long", lat="lat", clr="red"): 40 | geometry = [Point(xy) for xy in zip(df[long], df[lat])] 41 | gdf = GeoDataFrame(df, geometry=geometry) 42 | world = gpd.read_file(gpd.datasets.get_path(world_model)) 43 | 44 | gdf.plot(ax=world.plot(color='white', 45 | edgecolor='black', 46 | figsize=(20, 16)), 47 | marker='o', 48 | color=clr, 49 | markersize=5); 50 | 51 | 52 | def lines_on_map(df, x1="long", y1="lat", x2="long_gc", y2="lat_gc", clr="red"): 53 | world_ax = gpd.read_file(gpd.datasets 54 | .get_path(world_model)).plot( 55 | color='white', 56 | edgecolor='yellow', 57 | figsize=(20, 16)) 58 | 59 | geometry = [LineString([[x1,y1], [x2,y2]]) for 60 | x1,y1,x2,y2 in zip(df[x1], df[y1], df[x2], df[y2])] 61 | gdf = GeoDataFrame(df, geometry=geometry) 62 | gdf.plot(ax=world_ax, marker=None, color="black", markersize=1); 63 | 64 | geometry = [Point(xy) for xy in zip(df[x2], df[y2])] 65 | gdf = GeoDataFrame(df, geometry=geometry) 66 | gdf.plot(ax=world_ax, marker='o', color=clr, markersize=5, zorder=3) 67 | 68 | geometry = [Point(xy) for xy in zip(df[x1], df[y1])] 69 | gdf = GeoDataFrame(df, geometry=geometry) 70 | gdf.plot(ax=world_ax, marker='o', color="green", markersize=10, zorder=2) 71 | 
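# Usage sketch (hypothetical data, not executed by main() below): both plotting helpers above work
# on plain columns rather than geometries. coords_on_map() expects "long"/"lat"; lines_on_map()
# additionally expects the geocoded "long_gc"/"lat_gc" columns that geocoding() adds later in this
# script.
#
# demo = pd.DataFrame({"long": [14.42, 2.35], "lat": [50.09, 48.86],
#                      "long_gc": [14.50, 2.29], "lat_gc": [50.00, 48.90]})
# coords_on_map(demo)   # true points only
# lines_on_map(demo)    # true vs. geocoded points joined by black lines
# plt.show()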
72 | 73 | def spacy_ner(df): 74 | if not path.exists("model"): 75 | print("=== Local folder of NLP model not found. Downloading") 76 | spacy.cli.download(nlp_model) 77 | nlp = spacy.load(nlp_model) # load the freshly downloaded model, then cache it locally 78 | nlp.to_disk('model') 79 | else: 80 | nlp = spacy.load('model') 81 | 82 | df['gpe'] = None 83 | df['loc'] = None 84 | df['match_geo'] = False 85 | 86 | print("=== NER Processing text of tweets to get GPE and LOC") 87 | 88 | for ind in df.index: 89 | doc = nlp(df['text'][ind]) 90 | for ent in doc.ents: 91 | if (ent.label_ == 'GPE'): 92 | gpe = ent.text 93 | df['gpe'][ind] = gpe 94 | if gpe: 95 | df['match_geo'][ind] = str(df['place'][ind]).find(gpe) != -1 96 | 97 | if (ent.label_ == 'LOC'): 98 | loc = ent.text 99 | df['loc'][ind] = loc 100 | if loc: 101 | df['match_geo'][ind] = str(df['place'][ind]).find(loc) != -1 102 | 103 | print("===", str(round(sum(df["match_geo"])/len(df["id"])*100, 2)), 104 | "% of ALL processed data match decoded Geo/Loc") 105 | 106 | return df 107 | 108 | 109 | def geocoding(df, city_country=True, coords=False): 110 | geolocator = Nominatim(user_agent="geoapiExercises") 111 | 112 | print("OpenStreetMap Geolocator is set up") 113 | 114 | def city_country_row(row): # row-wise reverse geocoding (lat/long -> city, country); distinct from the city_country flag 115 | try: 116 | coord = f"{row['lat']}, {row['long']}" 117 | location = geolocator.reverse(coord, exactly_one=True, timeout=None) 118 | address = location.raw['address'] 119 | city = address.get('city', '') 120 | country = address.get('country', '') 121 | row['city'] = city 122 | row['country'] = country 123 | except GeocoderTimedOut as e: 124 | print("Error: geocode failed on input %s with coordinates %s"%(f"{row['lat']}, {row['long']}", e)) 125 | 126 | if row['city'] is None or row['country'] is None: 127 | print(row["id"], "not found for Coords:", row['long'], row["lat"], 128 | "==> City:", row['city'], "Country:", row["country"]) 129 | 130 | return row 131 | 132 | def coords_row(row): # row-wise forward geocoding (GPE -> long/lat); distinct from the coords flag 133 | try: 134 | location = geolocator.geocode(row['gpe'], timeout=None) 135 | if location: 136 | row['lat_gc'] = location.latitude 137 | row['long_gc'] = location.longitude 138 | except GeocoderTimedOut as e: 139 | print("Error: geocode failed on input %s with GPE %s"%(row['gpe'], e)) 140 | 141 | if row['long_gc'] is None: 142 | print(row["id"], "not found for GPE:", row["gpe"], 143 | "==> Long:", row['long_gc'], "Lat:", row["lat_gc"]) 144 | 145 | return row 146 | 147 | if coords: 148 | print("=== Geocoding GPE to long, lat in progress") 149 | 150 | df["long_gc"] = None 151 | df["lat_gc"] = None 152 | 153 | df = df.apply(coords_row, axis=1) 154 | 155 | print("===", str(round(sum(df["long_gc"].notna())/len(df["id"])*100, 2)), 156 | "% of processed data attributed coordinates") 157 | 158 | if city_country: 159 | print("=== Reverse geocoding long, lat to city, country in progress") 160 | 161 | df["city"] = None 162 | df["country"] = None 163 | 164 | df = df.apply(city_country_row, axis=1) 165 | 166 | print("===", str(round(sum(df["city"].notna())/len(df["id"])*100, 2)), 167 | "% of processed data attributed city") 168 | print("===", str(round(sum(df["country"].notna())/len(df["id"])*100, 2)), 169 | "% of processed data attributed country") 170 | 171 | return df 172 | 173 | 174 | def save_df(file, df): 175 | with open(file, "w") as f: 176 | df.to_json(f, orient='records', lines=True) 177 | print("=== Data saved to file", file) 178 | 179 | 180 | def read_args(train=False, empty=False): 181 | try: # 1 arg - data input 182 | file = sys.argv[1] 183 | except IndexError: 184 | file = train_set 185 | 186 | if train: 187 | train_data =

read_json(file) 188 | #coords_on_map(train_data) 189 | else: 190 | train_data = None 191 | 192 | try: # 2 arg - data imput withut geo 193 | file = sys.argv[2] 194 | 195 | except IndexError: 196 | file = empty 197 | 198 | if empty: 199 | empty_data = read_json(file) 200 | else: 201 | empty_data = None 202 | 203 | return train_data, empty_data 204 | 205 | 206 | def main(): 207 | train_data, empty_data = read_args(True, False) # load train and empty set 208 | 209 | # NER layer 210 | geo = spacy_ner(train_data) 211 | geo = geo[geo['gpe'].notna()] 212 | #print(geo.head(5)) 213 | #print(geo.info()) 214 | print("===", str(round(sum(geo["match_geo"])/len(geo["id"])*100, 2)), 215 | "% of SUCCESSFULLY processed data match decoded Geo/Loc") 216 | save_df(temp, geo) 217 | 218 | # Geocoding layer 219 | geo = read_json(temp) 220 | df = geocoding(geo, True, True) 221 | save_df(temp, df) 222 | 223 | # Coordinates processing 224 | df = read_json(temp) 225 | df = df[df['long_gc'].notna()] 226 | 227 | def distance(row): 228 | dist = geopy.distance.geodesic( 229 | (row['lat'],row['long']),(row['lat_gc'],row['long_gc'])) 230 | row["dist"] = dist.meters 231 | return row 232 | 233 | df = df.apply(distance, axis=1) 234 | print(df.info()) 235 | save_df(result, df) 236 | 237 | x=df.loc[df['match_geo']==1, 'dist'] 238 | y=df.loc[df['match_geo']==0, 'dist'] 239 | 240 | bins=list(range(100)) 241 | 242 | plt.figure(figsize=(18,8)) 243 | plt.title("Distance of coords mismatch by match in 'place' column") 244 | plt.hist(x, bins, alpha=0.5, label='true') 245 | plt.hist(y, bins, alpha=0.5, label='false') 246 | plt.legend(loc='upper right') 247 | plt.show() 248 | 249 | lines_on_map(df) 250 | 251 | 252 | if __name__ == "__main__": 253 | main() 254 | -------------------------------------------------------------------------------- /supplementary_resources/scripts/python/old_project_examples/extract_tweets_with_smileys.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import json 5 | import pyarrow as pa 6 | import pyarrow.parquet as pq 7 | import re 8 | import sys 9 | 10 | POSITIVE_SMILEYS = "😹😅😺😽😂🐱💌🌚😗💏😌💚❤😆👍💑💞😘🎉😛😚💓👌💖💛😙😁🐶😄👏😍💙🏆😊😀♡😻💘😝💗😃😜😇✌😎😋💕😉😸♥👻🌝🙂🤓🤗🤠🤣🤩🤪🥰🥳" # 💯 11 | NEGATIVE_SMILEYS = "🤨😒🙄😬🤥🤢🤮🥵🥶😵🤯😕😟🙁😮😯😲😳😦😧😨😰😥😢😱😖😣😞😓😩😫🥱😤😡😠🤬😈👿🖕👎😔🙀😾💀😭😪💔😿🙍👺🙎" # removed for ambiguity: 😭🤔🤐✊👊🥺😷💩🆘🥴💧 12 | 13 | def parse_tweet(obj): 14 | try: 15 | if 'extended_tweet' in obj: 16 | txt = obj['extended_tweet']['full_text'] 17 | else: 18 | txt = obj['text'] 19 | except KeyError: 20 | txt = None 21 | 22 | idstr = obj['id_str'] 23 | lang = obj['lang'] 24 | return idstr,txt,lang 25 | 26 | try: 27 | filename = sys.argv[1] 28 | except IndexError: 29 | filename = '/nfs/scistore14/chlgrp/chl/twitter-stream/data/corona/2020/02/corona-2020-02-25.txt' 30 | 31 | positive_smileys_re = re.compile(u'['+POSITIVE_SMILEYS+']') 32 | negative_smileys_re = re.compile(u'['+NEGATIVE_SMILEYS+']') 33 | 34 | positive_translation_table = str.maketrans("\n\r", " ", POSITIVE_SMILEYS) 35 | negative_translation_table = str.maketrans("\n\r", " ", NEGATIVE_SMILEYS) 36 | 37 | def filter(fid, min_length=10, skip_retweets=True): 38 | known_ids = set() 39 | for line in fid: 40 | if not line: 41 | continue 42 | try: 43 | obj = json.loads(line) 44 | except (json.decoder.JSONDecodeError, TypeError): 45 | #print("ERROR: entry wasn't a dictionary. 
skipping.", file=sys.stderr) 46 | continue 47 | 48 | try: 49 | if 'id_str' not in obj: 50 | print("ERROR: 'id' field not found in tweet", file=sys.stderr) 51 | continue 52 | if 'created_at' not in obj: 53 | print("ERROR: 'created_at' field not found in tweet {}".format(tweet['id']), file = sys.stderr) 54 | continue 55 | if 'retweeted_status' in obj and skip_retweets: # skip retweets 56 | continue 57 | except TypeError: 58 | print("ERROR: not a dict?", line, obj, file=sys.stderr) 59 | continue 60 | 61 | idstr, txt, lang = parse_tweet(obj) 62 | if not txt: # no text 63 | continue 64 | if idstr in known_ids: # duplicate 65 | continue 66 | known_ids.add(idstr) 67 | 68 | pos = re.findall(positive_smileys_re, txt) 69 | neg = re.findall(negative_smileys_re, txt) 70 | if not pos and not neg: 71 | continue # drop, because no smiley 72 | if pos and neg: 73 | continue # drop, because confusing 74 | if pos and not neg: 75 | txt = txt.translate(positive_translation_table) 76 | label = 1 77 | elif neg and not pos: 78 | txt = txt.translate(negative_translation_table) 79 | label = 0 80 | 81 | if len(txt) token (equiv. to [CLS]) 39 | x = self.out_proj(x) 40 | return x 41 | 42 | def main(): 43 | parser = argparse.ArgumentParser(description='Finetune multilingual transformer model') 44 | parser.add_argument('-o','--output_file', dest='output_file', type=str, default=None, help='Output file for predictions.') 45 | parser.add_argument('-n','--nepochs', type=int, default=1, help='Number of epochs to train') 46 | parser.add_argument('-m','--modelname', type=str, default='models/model.bin', help='Output filename for model (leave empty for not saving)') 47 | parser.add_argument('-t','--tokenizername', type=str, default='models/tokenizer.bin', help='Output filename for tokenizer (leave empty for not saving)') 48 | parser.add_argument('-T','--nthreads', type=int, default=None, help='Number of threads for tokenizing (default: OMP_NUM_THREADS)') 49 | parser.add_argument('-s','--seed', type=int, default=0, help='Random seed (default: 0)') 50 | parser.add_argument('-b','--batchsize', type=int, default=24, help='Per-device batchsize (default: 24)') 51 | parser.add_argument('-l','--linear', action="store_true", help="Train only linear model") 52 | parser.add_argument('--fp16', action="store_true", help="Use fp16 mixed precision") 53 | parser.add_argument('--fp32', dest='fp16', action="store_false", help="Use fp32 precision") 54 | parser.add_argument('-L','--lrscheduler', type=str, default="cosine", choices=['linear', 'cosine', 'cosine_with_restarts','polynomial','constant','constant_with_warmup'], help="Learning Rate Scheduler") 55 | parser.add_argument('files', nargs="+", type=str, metavar='INPUTFILES', help="Input files (in parquet format)") 56 | args = parser.parse_args() 57 | 58 | if not args.files: 59 | print("Usage: {} INPUTFILES".format(sys.argv[0])) 60 | raise SystemExit 61 | 62 | print(f"Loading {len(args.files)} data files.") 63 | df = load_data(args.files) 64 | 65 | tokenizer = XLMRobertaTokenizer.from_pretrained('sentence-transformers/stsb-xlm-r-multilingual') 66 | if args.tokenizername: 67 | print(f"Writing tokenizer {args.tokenizername}") 68 | tokenizer.save_pretrained(args.tokenizername) 69 | 70 | num_workers = get_num_workers(args.nthreads) 71 | print(f"Using pool with {num_workers} workers for tokenization.") 72 | tokenized_results = tokenize_dataset(df, tokenizer=tokenizer, num_workers=num_workers) 73 | 74 | # insert results 75 | df['input_ids'] = sum([part['input_ids'] for part in tokenized_results],[]) 
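# sum(list_of_lists, []) concatenates the per-chunk tokenizer outputs back into one flat list;
# np.array_split and pool.map both preserve order, so the rows stay aligned with df.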
76 | df['attention_mask'] = sum([part['attention_mask'] for part in tokenized_results],[]) 77 | 78 | val_size = min(len(df)//10, 10000) 79 | df_train, df_val = train_test_split(df, test_size=val_size, random_state=args.seed) 80 | del df 81 | data_train = TwitterDataset(df_train) 82 | del df_train 83 | data_val = TwitterDataset(df_val) 84 | del df_val 85 | 86 | if args.linear: 87 | # create model with frozen sentence embedding part 88 | with torch.no_grad(): 89 | model = AutoModelForSequenceClassification.from_pretrained('sentence-transformers/stsb-xlm-r-multilingual') 90 | for param in model.parameters(): 91 | param.requires_grad = False 92 | model.classifier = LinearHead(hidden_size=768, num_labels=2) 93 | else: 94 | model = AutoModelForSequenceClassification.from_pretrained('sentence-transformers/stsb-xlm-r-multilingual') 95 | 96 | num_gpus = max(1, torch.cuda.device_count()) 97 | print(f"Using {num_gpus} devices to train.") 98 | training_args = TrainingArguments( 99 | output_dir='./results/', 100 | report_to = "all", 101 | adafactor=True, # not using AdamW, let's see 102 | learning_rate=5e-5, # default is 5e-5 103 | num_train_epochs=args.nepochs, 104 | per_device_train_batch_size=args.batchsize, 105 | per_device_eval_batch_size=args.batchsize, 106 | #warmup_steps=500, # number of warmup steps for learning rate scheduler 107 | warmup_steps=0.1, # number of warmup steps for learning rate scheduler 108 | #lr_scheduler_type=args.lrscheduler, 109 | #weight_decay=0.0, # strength of weight decay 110 | logging_dir='./logs/', # directory for storing logs 111 | logging_steps=100, # log often 112 | evaluation_strategy="steps", 113 | eval_steps=500, # evaluate often 114 | save_strategy="epoch", # save rarely 115 | fp16 = args.fp16, 116 | #fp16_full_eval=True 117 | ) 118 | num_steps_per_epoch = len(data_train)//(args.batchsize*num_gpus) 119 | num_updates = num_steps_per_epoch * args.nepochs 120 | if num_updates > 5000: 121 | training_args.eval_steps = 1000 # evaluate less often for big datasets 122 | if num_updates < 100: 123 | training_args.logging_steps = 1 124 | training_args.eval_steps = num_updates//5 125 | elif num_updates < 1000: 126 | training_args.logging_steps = 10 127 | training_args.eval_steps = num_updates//10 128 | 129 | optimizer = Adafactor(model.parameters(), scale_parameter=False, relative_step=False, warmup_init=False, lr=5e-5) 130 | lr_scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optimizer=optimizer, 131 | num_warmup_steps=0, 132 | num_training_steps=num_updates, 133 | num_cycles=args.nepochs) 134 | 135 | trainer = Trainer( 136 | model=model, 137 | args=training_args, 138 | train_dataset=data_train, 139 | eval_dataset=data_val, 140 | compute_metrics=compute_metrics, 141 | optimizers=(optimizer, lr_scheduler) 142 | ) 143 | trainer.train() 144 | trainer.evaluate() 145 | 146 | del data_train, data_val 147 | 148 | if args.modelname: 149 | print(f"Writing final model {args.modelname}") 150 | trainer.save_model(args.modelname) 151 | 152 | if __name__ == "__main__": 153 | main() 154 | -------------------------------------------------------------------------------- /supplementary_resources/scripts/python/old_project_examples/tweet_utils.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | import multiprocessing 3 | import numpy as np 4 | import os 5 | import pandas as pd 6 | import pyarrow.parquet as pq 7 | from sklearn.metrics import accuracy_score, roc_auc_score 8 | import torch 9 | 10 | def 
load_parquet(filename): 11 | df = pq.read_table(filename).to_pandas() 12 | df.set_index('id') 13 | return df 14 | 15 | def load_data(filenames): 16 | df = pd.concat(load_parquet(f) for f in filenames) 17 | return df 18 | 19 | class TwitterDataset(torch.utils.data.Dataset): 20 | def __init__(self, df): 21 | self.input_ids = df.input_ids.to_numpy() 22 | self.attention_mask = df.attention_mask.to_numpy() 23 | self.labels = df.label.to_numpy() 24 | 25 | def __getitem__(self, idx): 26 | item = {} 27 | item['input_ids'] = torch.tensor(self.input_ids[idx]) 28 | item['attention_mask'] = torch.tensor(self.attention_mask[idx]) 29 | item['labels'] = torch.tensor(self.labels[idx]) 30 | return item 31 | 32 | def __len__(self): 33 | return len(self.labels) 34 | 35 | def compute_metrics(pred): 36 | labels = pred.label_ids 37 | scores = pred.predictions[:,1]-pred.predictions[:,0] # logits, so not normalized 38 | preds = pred.predictions.argmax(axis=-1) 39 | acc = accuracy_score(labels, preds) 40 | auc = roc_auc_score(labels, scores) 41 | return { 'accuracy': acc, 'auc': auc } 42 | 43 | def tokenize(tokenizer, df): 44 | return tokenizer.batch_encode_plus(df.text, padding="max_length", max_length=160, truncation=True) 45 | 46 | def tokenize_dataset(df, tokenizer, num_workers): 47 | print(f"Will use pool with {num_workers} workers for tokenization.") 48 | with multiprocessing.Pool(num_workers) as pool: 49 | chunks = np.array_split(df, num_workers) 50 | tokenized_results = pool.map(partial(tokenize, tokenizer), chunks) 51 | return tokenized_results 52 | 53 | def get_num_workers(max_nthreads): 54 | num_workers = max(1, multiprocessing.cpu_count()-1) 55 | if max_nthreads: 56 | num_workers = min(num_workers, max_nthreads) 57 | else: 58 | num_workers = min(num_workers, int(os.environ.get("OMP_NUM_THREADS", 1000000))) 59 | return num_workers 60 | -------------------------------------------------------------------------------- /supplementary_resources/scripts/python/transformers-tutorial-test.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | import numpy as np 3 | from datasets import load_metric 4 | from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer 5 | from torch import nn 6 | import torch 7 | 8 | # test tutorial transformers run 9 | 10 | dataset = load_dataset("yelp_review_full") 11 | 12 | print(dataset["train"][100]) 13 | dataset_df = dataset["train"].to_pandas() 14 | print(dataset_df.head()) 15 | 16 | features = dataset["train"].features 17 | print(features) 18 | 19 | print(dataset_df["label"].value_counts(normalize=True).sort_index()) 20 | dataset = dataset.rename_column("label", "labels") 21 | 22 | tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") 23 | 24 | 25 | def tokenize_function(examples): 26 | return tokenizer(examples["text"], padding="max_length", truncation=True) 27 | 28 | 29 | tokenized_datasets = dataset.map(tokenize_function, batched=True) 30 | print(tokenized_datasets) 31 | 32 | small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) 33 | small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) 34 | 35 | model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) 36 | metric = load_metric("accuracy") 37 | 38 | 39 | def compute_metrics(eval_pred): 40 | logits, labels = eval_pred 41 | predictions = np.argmax(logits, axis=-1) 42 | return 
metric.compute(predictions=predictions, references=labels) 43 | 44 | 45 | class_weights = (1 - (dataset_df["label"].value_counts().sort_index() / len(dataset_df))).values 46 | print(class_weights) 47 | class_weights = torch.from_numpy(class_weights).float() 48 | print(class_weights) 49 | 50 | 51 | class WeightedLossTrainer(Trainer): 52 | def compute_loss(selfself, model, inputs, return_outputs=False): 53 | outputs = model(**inputs) 54 | logits = outputs.get("logits") 55 | labels = inputs.get("labels") 56 | loss_func = nn.CrossEntropyLoss(weight=class_weights) 57 | loss = loss_func(logits, labels) 58 | return (loss, outputs) if return_outputs else loss 59 | 60 | 61 | batch_size = 8 62 | logging_steps = len(dataset["train"]) // batch_size 63 | output_dir = "test_trainer" 64 | training_args = TrainingArguments(output_dir=output_dir, 65 | num_train_epochs=3, 66 | learning_rate=2e-5, 67 | per_device_train_batch_size=batch_size, 68 | per_device_eval_batch_size=batch_size, 69 | weight_decay=0.01, 70 | logging_steps=logging_steps, 71 | evaluation_strategy="epoch") 72 | 73 | trainer = WeightedLossTrainer( 74 | model=model, 75 | args=training_args, 76 | train_dataset=small_train_dataset, 77 | eval_dataset=small_eval_dataset, 78 | compute_metrics=compute_metrics, 79 | ) 80 | 81 | trainer.train() 82 | 83 | trainer.evaluate() 84 | 85 | trainer.save_model('save_test/model') 86 | # alternative saving method and folder 87 | model.save_pretrained('saving_test') 88 | -------------------------------------------------------------------------------- /train_bert.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from transformers import BertTokenizer 3 | from utils.model_trainer import * 4 | 5 | # Entry point for training and evaluation of the models 6 | 7 | f = ["GEO", "NON-GEO", "META-DATA", "TEXT-ONLY", "GEO-ONLY", "USER-ONLY"] 8 | 9 | dataset_file = "test-3877395-filtered-16219676-us-twitter-2021.jsonl" # .jsonl 10 | features = [f[1], f[4]] 11 | val_f = f[1] # None -> features[0] 12 | target_columns = ["lon", "lat"] 13 | 14 | original_worldwide_model = "bert-base-multilingual-cased" 15 | original_usa_model = "bert-base-cased" 16 | 17 | # parameters = dict( 18 | # max_lr = [5e-5, 1e-5], 19 | # min_lr = [5e-6, 1e-6, 1e-8, 1e-16], 20 | # scheduler = ["cosine", "plateau"] 21 | # ) 22 | # param_values = [v for v in parameters.values()] 23 | 24 | covariance_types = [None, "spher"] # [None, "full", "spher", "diag", "tied"] 25 | scheduler_types = ["cosine", "linear", "plateau"] # ["cosine", "linear", "cosine-long", "plateau", "step", "multi step", "one cycle", "cyclic"] 26 | 27 | loss_distance = True 28 | loss_mf = "mean" # mean/sum - mean if features > 1 29 | loss_prob = "pos" # all/pos - pos if prob 30 | loss_total = "mean" # sum/mean/type - mean if prob else type (spat) 31 | 32 | outcomes = 5 33 | covariance = covariance_types[1] # None/spher 34 | 35 | epochs = 3 36 | log_step = 1000 37 | 38 | batch_size = 4 39 | 40 | lr_max = 1e-5 41 | lr_min = 1e-6 42 | scheduler = scheduler_types[0] 43 | 44 | val_size = 1000 # samples/users if -vu 45 | threshold = 100 46 | 47 | train_size = 0 48 | test_ratio = 0.1 49 | seed = 42 50 | 51 | ref_file = None # "us-twitter-2020.jsonl" # if not None exclude found users from the current data 52 | bot_filter = False 53 | 54 | 55 | def main(): 56 | parser = argparse.ArgumentParser(description='Finetune multilingual transformer model') 57 | parser.add_argument('-n', '--nepochs', type=int, default=epochs, help='Number of epochs 
to train') 58 | parser.add_argument('-ss', '--skip', type=int, default=0, help='Number of dataset samples to skip') 59 | 60 | parser.add_argument('-sc', '--scale_coord', action="store_true", help="Scale coordinates by 0.01 (default: False, coordinates are kept unscaled)") 61 | parser.add_argument('-o', '--outcomes', type=int, default=outcomes, help="Number of outcomes (lon, lat) per tweet") 62 | parser.add_argument('-c', '--covariance', type=str, default=covariance, help="Covariance matrix type") 63 | parser.add_argument('-nw', '--weighted', action="store_false", help="Use equal GMM weights instead of learned ones (default: weighted)") 64 | 65 | parser.add_argument('-ld', '--loss_dist', action="store_false", help="Disable the distance loss criterion and use per-coordinate loss instead (default: distance loss enabled)") 66 | parser.add_argument('-lmf', '--loss_mf', type=str, default=loss_mf, help="Multi feature loss handle mean or sum (default: mean)") 67 | parser.add_argument('-lp', '--loss_prob', type=str, default=loss_prob, help="Probabilistic loss domain all or pos (default: pos)") 68 | parser.add_argument('-lt', '--loss_total', type=str, default=loss_total, help="Total loss handle by model type, sum or mean (default: mean)") 69 | 70 | parser.add_argument('-m', '--local_model', type=str, default=None, help='Filename prefix of local model') 71 | parser.add_argument('--nockp', action="store_false", help='Disable saving model checkpoints during training (default: checkpoints are saved)') 72 | 73 | parser.add_argument('-lr', '--learn_rate', type=float, default=lr_max, help='Learning rate (default: 1e-5)') 74 | parser.add_argument('-lrm', '--learn_rate_min', type=float, default=lr_min, help='Learning rate minimum (default: 1e-6)') 75 | parser.add_argument('-sdl', '--scheduler', type=str, default=scheduler, help="Scheduler type") 76 | 77 | parser.add_argument('-b', '--batch_size', type=int, default=batch_size, help='Per-device batch size (default: 4)') 78 | parser.add_argument('-ls', '--log_step', type=int, default=log_step, help='Log step (default: 1000)') 79 | 80 | parser.add_argument('-us', '--usa_model', action="store_true", help="Use USA model instead of worldwide (default: False)") 81 | parser.add_argument('-d', '--dataset', type=str, default=dataset_file, help="Input dataset (in jsonl format)") 82 | parser.add_argument('-f', '--features', default=features, nargs='+', help="Feature names") 83 | parser.add_argument('-ts', '--train_size', type=int, default=train_size, help='Training dataloader size') 84 | parser.add_argument('-tr', '--test_ratio', type=float, default=test_ratio, help='Training dataloader test ratio (default: 0.1)') 85 | parser.add_argument('-s', '--seed', type=int, default=seed, help='Random seed (default: 42)') 86 | parser.add_argument('-v', '--val_size', type=int, default=val_size, help='Validation dataloader size') 87 | parser.add_argument('-th', '--threshold', type=int, default=threshold, help='Validation threshold in km (default: 100)') 88 | parser.add_argument('-vu', '--val_user', action="store_true", help="Form validation dataset by user (default: False)") 89 | 90 | parser.add_argument('--train', action="store_true", help="Start finetuning") 91 | parser.add_argument('--eval', action="store_true", help="Start evaluation") 92 | parser.add_argument('--hptune', action="store_true", help="Start training with hyperparameter tuning") 93 | args = parser.parse_args() 94 | 95 | if args.local_model is None: 96 | prefix = f"{'US-' if args.usa_model else ''}{'U-' if not args.scale_coord else ''}{'+'.join(args.features)}-O{args.outcomes}-{'d' if args.loss_dist else 'c'}-" \ 97 | f"total_{args.loss_total if 
args.covariance is not None else 'type'}-{'mf_' + args.loss_mf + '-' if len(args.features) > 1 else ''}" \ 98 | f"{args.loss_prob + '_' if args.covariance is not None else ''}{args.covariance if args.covariance is not None else 'NP'}-" \ 99 | f"{'weighted-' if args.weighted and args.outcomes > 1 else ''}N{args.train_size//100000}e5-" \ 100 | f"B{args.batch_size}-E{args.nepochs}-{args.scheduler}-LR[{args.learn_rate};{args.learn_rate_min}]" 101 | else: 102 | prefix = args.local_model 103 | 104 | print(f"Model prefix:\t{prefix}") 105 | if torch.cuda.is_available(): 106 | print(f"DEVICE\tAvailable GPU has {torch.cuda.device_count()} devices, using {torch.cuda.get_device_name(0)}") 107 | print(f"DEVICE\tCPU has {torch.get_num_threads()} threads") 108 | else: 109 | print(f"DEVICE\tNo GPU available, using the CPU with {torch.get_num_threads()} threads instead.") 110 | 111 | original_model = original_usa_model if args.usa_model else original_worldwide_model 112 | 113 | # combine_datasets(["test-10501727-filtered-17174594-worldwide-twitter-2020_0.jsonl", "test-10586286-filtered-17264575-worldwide-twitter-2020_1.jsonl", "test-3783510-filtered-6464689-worldwide-twitter-2020_2.jsonl"], "test-filtered-worldwide-twitter-2020.jsonl") 114 | 115 | dataloader = TwitterDataloader(args.dataset, 116 | args.features, 117 | target_columns, 118 | BertTokenizer.from_pretrained(original_model), 119 | args.seed, 120 | args.scale_coord, 121 | val_f, 122 | bot_filter) 123 | 124 | # no settings run to save filtered by condition dataset copy 125 | # dataloader.filter_dataset("code", "US", None) 126 | 127 | trainer = ModelTrainer(prefix, 128 | dataloader, 129 | args.nepochs, 130 | args.batch_size, 131 | args.outcomes, 132 | args.covariance, 133 | args.weighted, 134 | args.loss_dist, 135 | args.loss_mf, 136 | args.loss_prob, 137 | args.loss_total, 138 | args.learn_rate, 139 | args.learn_rate_min, 140 | original_model) 141 | 142 | # if args.hptune: 143 | # trainer.hp_tuning(args.train_size, 144 | # args.test_ratio, 145 | # param_values, 146 | # args.log_step) 147 | 148 | if args.train: 149 | trainer.finetune(args.train_size, 150 | args.test_ratio, 151 | f"{prefix}.pth", 152 | args.nockp, 153 | args.log_step, 154 | args.scheduler, 155 | args.skip) 156 | 157 | if args.eval: 158 | trainer.eval(args.val_size, 159 | args.threshold, 160 | args.val_size, 161 | args.val_user, 162 | args.train_size, 163 | ref_file) 164 | 165 | 166 | if __name__ == "__main__": 167 | main() 168 | -------------------------------------------------------------------------------- /utils/benchmarks.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.distributions as dist 4 | import numpy as np 5 | from utils.regressor import * 6 | 7 | # model benchmarks for loss and metrics 8 | 9 | 10 | # R2 score 11 | def r2_score(X, Y): 12 | labels_mean = torch.mean(Y) 13 | ss_tot = torch.sum((Y - labels_mean) ** 2) 14 | ss_res = torch.sum((Y - X) ** 2) 15 | r2 = 1 - ss_res / ss_tot 16 | return r2 17 | 18 | 19 | # spher cov from raw outputs 20 | def spher_sigma(X, outcomes, outputs_map, lower_limit=0): 21 | softplus = nn.Softplus() 22 | S = softplus(X[:, outputs_map["sigma"][0]:outputs_map["sigma"][1]]) + lower_limit 23 | return S.reshape([X.size(dim=0), outcomes]) 24 | 25 | 26 | # GM/GMM from raw outputs 27 | def GaussianModel(X, outcomes, outputs_map, prob_domain, cov): 28 | softplus = nn.Softplus() 29 | batch = X.size(dim=0) 30 | 31 | means = X[:, 
outputs_map["coord"][0]:outputs_map["coord"][1]] 32 | if outcomes > 1: 33 | means = means.reshape([batch, outcomes, 2]) 34 | 35 | sigma_lower_limit = 1 / (2 * math.pi) if prob_domain == "pos" else 0 36 | positive_sigma = spher_sigma(X, outcomes, outputs_map, sigma_lower_limit) 37 | 38 | sigma = None 39 | tril = None 40 | 41 | if cov == "spher": 42 | sigma = torch.eye(2, device=X.device) * positive_sigma.reshape(-1, 1)[:, None] 43 | elif cov == "diag": 44 | sigma = torch.eye(2, device=X.device) * positive_sigma.reshape(-1, 2)[:, None] 45 | else: 46 | tril_indices = torch.tril_indices(row=2, col=2, offset=0, device=X.device) 47 | if cov == "tied" or outcomes == 1: 48 | tril = torch.zeros((2, 2), device=X.device).repeat(batch, 1).reshape([batch, 2, 2]) 49 | tril[:, tril_indices[0], tril_indices[1]] = positive_sigma.reshape([batch, 3]) 50 | else: 51 | tril = torch.zeros((2, 2), device=X.device).repeat(batch, outcomes).reshape([batch, outcomes, 2, 2]) 52 | tril[:, :, tril_indices[0], tril_indices[1]] = positive_sigma.reshape([batch, outcomes, 3]) 53 | 54 | if sigma is not None: 55 | if outcomes > 1: 56 | sigma = sigma.reshape([batch, outcomes, 2, 2]) 57 | gaussian = dist.MultivariateNormal(means, sigma) 58 | else: 59 | if outcomes > 1 and cov == "tied": 60 | tril = tril.reshape(batch, -1).repeat(1, outcomes).reshape([batch, outcomes, 2, 2]) 61 | gaussian = dist.MultivariateNormal(means, scale_tril=tril) 62 | 63 | return gaussian 64 | 65 | 66 | # GMM weights from raw outputs 67 | def GaussianWeights(X, outcomes, outputs_map): 68 | softmax = nn.Softmax(dim=1) 69 | weights = X[:, outputs_map["weight"][0]:outputs_map["weight"][1]].reshape([X.size(dim=0), outcomes]) if outputs_map["weight"] else torch.ones((X.size(dim=0), outcomes), device=X.device) 70 | gmm_weights = dist.Categorical(softmax(weights)) 71 | return gmm_weights 72 | 73 | 74 | # spatial weights from raw outputs 75 | def weights(X, outcomes, outputs_map): 76 | softmax = nn.Softmax(dim=1) 77 | W = X[:, outputs_map["weight"][0]:outputs_map["weight"][1]].reshape([X.size(dim=0), outcomes]) if outputs_map["weight"] else torch.ones((X.size(dim=0), outcomes), device=X.device) 78 | return softmax(W) 79 | 80 | 81 | # Distance^2 to genuine truth from raw outputs 82 | def dist2(X, y, outcomes, outputs_map): 83 | def coord_se(X, y, outcomes): 84 | error_by_coord = nn.MSELoss(reduction='none') 85 | Y = y.repeat(1, outcomes) 86 | E = error_by_coord(X, Y) 87 | return E 88 | 89 | E = coord_se(X[:, outputs_map["coord"][0]:outputs_map["coord"][1]], y, outcomes) 90 | D = torch.zeros((X.size(dim=0), 0), device=X.device) 91 | for i in range(outputs_map["coord"][0], outputs_map["coord"][1], 2): 92 | D = torch.cat((D, torch.sum(E[:, i:i+2], dim=1, keepdim=True)), 1) 93 | return D 94 | 95 | 96 | # Negative Log Likelihood fit for genuine truth from raw outputs 97 | def lh_loss(X, y, outcomes, outputs_map, prob_domain): 98 | gaussian = GaussianModel(X, outcomes, outputs_map, prob_domain, "spher") 99 | if outcomes > 1: 100 | gmm_weights = GaussianWeights(X, outcomes, outputs_map) 101 | gmm = dist.MixtureSameFamily(gmm_weights, gaussian) 102 | L = -gmm.log_prob(y) 103 | else: 104 | L = -gaussian.log_prob(y) 105 | return L 106 | 107 | 108 | # weighted D2 loss from raw outputs 109 | def d_loss(X, y, outcomes, outputs_map): 110 | D = dist2(X, y, outcomes, outputs_map) 111 | if outcomes > 1: 112 | W = weights(X, outcomes, outputs_map) 113 | L = torch.sum(D * W, dim=1) 114 | else: 115 | L = D 116 | return L 117 | 118 | 119 | class ModelBenchmark(): 120 | def 
__init__(self, model, distance=True, loss_prob="pos", mf_loss="mean", total_loss="type"): 121 | self.model = model 122 | self.dist = distance 123 | self.prob_domain = loss_prob 124 | self.mf_handle = mf_loss 125 | self.total_loss_crit = total_loss 126 | 127 | self.outcomes = self.model.n_outcomes 128 | self.cov = self.model.cov 129 | self.weighted = self.model.weighted 130 | self.features = self.model.features 131 | 132 | self.outputs_map = { 133 | "coord": [0, self.model.coord_output], 134 | "weight": [self.model.coord_output, self.model.coord_output + self.model.weights_output] if self.weighted else None, 135 | "sigma": [self.model.coord_output + self.model.weights_output, self.model.coord_output + self.model.weights_output + self.model.cov_output] if self.cov else None 136 | } 137 | 138 | self.single_outputs_map = { 139 | "coord": [0, 2], 140 | "weight": None, 141 | "sigma": [2, 3] if self.cov else None 142 | } 143 | 144 | self.loss_type = 1 if self.outputs_map["sigma"] else 0 145 | 146 | print(f"TRAIN\tLOSS\tKey Feature - {'sum of spat and prob' if self.total_loss_crit == 'sum' else 'by model type'}:\n" 147 | f"\tGeospatial accuracy:\t{'weighted ' if self.weighted else ''}{'distance' if self.dist else 'coord'} error^2 for {self.outcomes} outcome(s)") 148 | 149 | if self.outputs_map["sigma"] is not None: 150 | print(f"\tProbability accuracy:\t {'limited' if self.prob_domain == 'pos' else 'unlimited'} -LLH for PDF of {'weighted ' if self.weighted else ''}" 151 | f"{'GM' if self.outcomes == 1 else 'GMM'} with {self.cov} covariance matrix ") 152 | 153 | if len(self.features) > 1: 154 | print(f"TRAIN\tLOSS\tMinor Features - {self.mf_handle} of:\n\tGeospatial accuracy:\tsingle {'distance' if self.dist else 'coord'} error^2") 155 | if self.outputs_map["sigma"]: 156 | print(f"\tProbability accuracy:\t {'limited' if self.prob_domain == 'pos' else 'unlimited'} -LLH for PDF of single GM with spher covariance matrix ") 157 | 158 | def minor_feature_loss(self, outputs, labels): 159 | X = outputs.squeeze().float() 160 | y = labels.squeeze().float() 161 | 162 | if X.dim() == 1: 163 | X = X.reshape(1, -1) 164 | 165 | spat_loss = d_loss(X, y, 1, self.single_outputs_map).mean() 166 | prob_loss = torch.zeros_like(spat_loss, device=X.device) 167 | if self.outputs_map["sigma"]: 168 | prob_loss = lh_loss(X, y, 1, self.single_outputs_map, self.prob_domain).mean() 169 | return spat_loss, prob_loss 170 | 171 | def key_feature_loss(self, outputs, labels): 172 | X = outputs.squeeze().float() 173 | y = labels.squeeze().float() 174 | 175 | if X.dim() == 1: 176 | X = X.reshape(1, -1) 177 | 178 | spat_loss = d_loss(X, y, self.outcomes, self.outputs_map).mean() 179 | prob_loss = torch.zeros_like(spat_loss, device=X.device) 180 | if self.outputs_map["sigma"]: 181 | prob_loss = lh_loss(X, y, self.outcomes, self.outputs_map, self.prob_domain).mean() 182 | return spat_loss, prob_loss 183 | 184 | def total_batch_loss(self, batch_loss): 185 | all_features_loss = torch.mean(batch_loss, dim=0) if self.mf_handle == "mean" else torch.sum(batch_loss, dim=0) 186 | if self.total_loss_crit == "sum": 187 | total_loss = torch.sum(all_features_loss, dim=0) 188 | elif self.total_loss_crit == "mean": 189 | total_loss = torch.mean(all_features_loss, dim=0) 190 | elif self.total_loss_crit == "type": 191 | total_loss = all_features_loss[1 if self.outputs_map["sigma"] else 0] 192 | return total_loss 193 | 194 | # pytorch GM/GMM from raw outputs 195 | def prob_models(self, outputs): 196 | X = outputs.squeeze().float() 197 | 198 | if X.dim() 
== 1: 199 | X = X.reshape(1, -1) 200 | 201 | gaussian = GaussianModel(X, self.outcomes, self.outputs_map, self.prob_domain, self.cov) 202 | if self.outcomes > 1: 203 | gmm_weights = GaussianWeights(X, self.outcomes, self.outputs_map) 204 | return dist.MixtureSameFamily(gmm_weights, gaussian) 205 | else: 206 | return gaussian 207 | 208 | # spat and prob loss from raw outputs 209 | def result_metrics(self, outputs, labels): 210 | X = outputs.squeeze().float() 211 | y = labels.squeeze().float() 212 | 213 | if X.dim() == 1: 214 | X = X.reshape(1, -1) 215 | 216 | spat_loss = d_loss(X, y, self.outcomes, self.outputs_map).reshape(-1, 1) 217 | prob_loss = torch.zeros_like(spat_loss, device=X.device) 218 | if self.outputs_map["sigma"]: 219 | prob_loss = lh_loss(X, y, 1, self.single_outputs_map, self.prob_domain).reshape(-1, 1) 220 | 221 | return spat_loss, prob_loss 222 | 223 | def r2(self, outputs, labels): 224 | if outputs.dim() == 1: 225 | outputs = outputs.reshape(1, -1) 226 | 227 | Y = labels.repeat(1, self.outcomes) if self.outcomes > 1 else labels 228 | X = outputs[:, self.outputs_map["coord"][0]:self.outputs_map["coord"][1]] 229 | r2 = r2_score(X, Y) 230 | return r2 231 | 232 | # tensorboard metrics logging 233 | def log(self, writer, step, lr, train_metric, cur_batch, val_metric=None): 234 | def total_loss_log(metric, metric_type="total"): 235 | if metric_type == "val": 236 | atf_loss = metric[:, 0] 237 | folder = f"mean_val" 238 | else: 239 | atf_loss = np.mean(metric[:, :, 0], axis=0) if self.mf_handle == "mean" else np.sum(metric[:, :, 0], axis=0) 240 | folder = f"current_step" if metric_type != "total" else f"mean_train" 241 | 242 | if self.total_loss_crit == "sum": 243 | total_loss = np.sum(atf_loss, axis=0) 244 | elif self.total_loss_crit == "mean": 245 | total_loss = np.mean(atf_loss, axis=0) 246 | elif self.total_loss_crit == "type": 247 | total_loss = atf_loss[1 if self.outputs_map["sigma"] else 0] 248 | 249 | log = f"\tTotal loss of all features:\t{total_loss}" 250 | print(log) 251 | writer.add_scalar(f"{folder}/total_loss", total_loss, step) 252 | 253 | if metric_type == "total": 254 | self.mean_epoch_train_loss = total_loss 255 | 256 | def spat_loss_log(metric, metric_type="total"): 257 | if metric_type == "val": 258 | spat_loss, r2 = metric[0, 0], metric[0, 1] 259 | folder, log = f"mean_val", f"\tGeospatial {spatial} loss:\t{spat_loss}\tCoord R2:\t{r2}" 260 | else: 261 | spat_loss, r2 = np.mean(metric[:, 0, 0], axis=0) if self.mf_handle == "mean" else np.sum(metric[:, 0, 0], axis=0), None 262 | folder, log = f"current_step", f"\tGeospatial {self.mf_handle} {spatial} loss:\t{spat_loss}" 263 | if metric_type == "total": 264 | r2 = metric[0, 0, 1] 265 | folder = f"mean_train" 266 | log += f"\tCoord R2:\t{r2}" 267 | 268 | if metric.shape[0] > 1: 269 | key_spat, minor_spat = metric[0, 0, 0], np.mean(metric[1:, 0, 0], axis=0) 270 | log += f"\n\t\tKey:\t{key_spat}\tMinor:\t{minor_spat}" 271 | writer.add_scalar(f"{folder}/spat_key", key_spat, step) 272 | writer.add_scalar(f"{folder}/spat_minor", minor_spat, step) 273 | 274 | print(log) 275 | writer.add_scalar(f"{folder}/loss_spat", spat_loss, step) 276 | if r2: 277 | writer.add_scalar(f"{folder}/r2", r2, step) 278 | 279 | def prob_loss_log(metric, metric_type="total"): 280 | if metric_type == "val": 281 | prob_loss, pdf = metric[1, 0], metric[1, 1] 282 | folder, log = f"mean_val", f"\tProbabilistic {self.mf_handle} -LLH loss:\t{prob_loss}\tPDF:\t{pdf}" 283 | else: 284 | prob_loss, pdf = np.mean(metric[:, 1, 0], axis=0) if 
self.mf_handle == "mean" else np.sum(metric[:, 1, 0], axis=0), None 285 | folder, log = f"current_step", f"\tProbabilistic {self.mf_handle} -LLH loss:\t{prob_loss}" 286 | if metric_type == "total": 287 | pdf = metric[0, 1, 1] 288 | folder = f"mean_train" 289 | log += f"\tPDF:\t{pdf}" 290 | 291 | if metric.shape[0] > 1: 292 | key_prob, minor_prob = metric[0, 1, 0], np.mean(metric[1:, 1, 0], axis=0) 293 | log += f"\n\t\tKey:\t{key_prob}\tMinor:\t{minor_prob}" 294 | writer.add_scalar(f"{folder}/prob_key", key_prob, step) 295 | writer.add_scalar(f"{folder}/prob_minor", minor_prob, step) 296 | 297 | print(log) 298 | writer.add_scalar(f"{folder}/loss_prob", prob_loss, step) 299 | if pdf: 300 | writer.add_scalar(f"{folder}/pdf", pdf, step) 301 | 302 | spatial = 'D^2' if self.dist else 'Coord' 303 | processed_metric = train_metric[0:cur_batch, :] 304 | mean_metric = np.mean(processed_metric, axis=0) 305 | current_batch_metric = processed_metric[-1, :] 306 | # print(current_batch_metric) 307 | 308 | print(f"LOG\tCurrent step: {step}\tLR:\t{lr}") 309 | total_loss_log(current_batch_metric, "current") 310 | spat_loss_log(current_batch_metric, "current") 311 | if self.outputs_map["sigma"]: 312 | prob_loss_log(current_batch_metric, "current") 313 | 314 | print(f"LOG\tTRAIN\tMean metrics:") 315 | total_loss_log(mean_metric, "total") 316 | spat_loss_log(mean_metric, "total") 317 | if self.outputs_map["sigma"]: 318 | prob_loss_log(mean_metric, "total") 319 | 320 | if val_metric is not None: 321 | mean_val_metric = np.mean(val_metric, axis=0) 322 | 323 | print(f"LOG\tVAL\tMean metrics:") 324 | total_loss_log(mean_val_metric, "val") 325 | spat_loss_log(mean_val_metric, "val") 326 | if self.outputs_map["sigma"]: 327 | prob_loss_log(mean_val_metric, "val") 328 | 329 | writer.flush() 330 | -------------------------------------------------------------------------------- /utils/cosine_scheduler.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterable 2 | from math import log, cos, pi, floor 3 | 4 | from torch.optim.lr_scheduler import _LRScheduler 5 | 6 | # cosine scheduler fpr learning rate during training 7 | # source code: https://github.com/abhuse/cyclic-cosine-decay 8 | # author: @abhuse 9 | 10 | 11 | class CyclicCosineDecayLR(_LRScheduler): 12 | def __init__(self, 13 | optimizer, 14 | init_decay_epochs, 15 | min_decay_lr, 16 | restart_interval=None, 17 | restart_interval_multiplier=None, 18 | restart_lr=None, 19 | warmup_epochs=None, 20 | warmup_start_lr=None, 21 | last_epoch=-1, 22 | verbose=False): 23 | """ 24 | Initialize new CyclicCosineDecayLR object. 25 | :param optimizer: (Optimizer) - Wrapped optimizer. 26 | :param init_decay_epochs: (int) - Number of initial decay epochs. 27 | :param min_decay_lr: (float or iterable of floats) - Learning rate at the end of decay. 28 | :param restart_interval: (int) - Restart interval for fixed cycles. 29 | Set to None to disable cycles. Default: None. 30 | :param restart_interval_multiplier: (float) - Multiplication coefficient for geometrically increasing cycles. 31 | Default: None. 32 | :param restart_lr: (float or iterable of floats) - Learning rate when cycle restarts. 33 | If None, optimizer's learning rate will be used. Default: None. 34 | :param warmup_epochs: (int) - Number of warmup epochs. Set to None to disable warmup. Default: None. 35 | :param warmup_start_lr: (float or iterable of floats) - Learning rate at the beginning of warmup. 36 | Must be set if warmup_epochs is not None. 
Default: None. 37 | :param last_epoch: (int) - The index of the last epoch. This parameter is used when resuming a training job. Default: -1. 38 | :param verbose: (bool) - If True, prints a message to stdout for each update. Default: False. 39 | """ 40 | 41 | if not isinstance(init_decay_epochs, int) or init_decay_epochs < 1: 42 | raise ValueError("init_decay_epochs must be positive integer, got {} instead".format(init_decay_epochs)) 43 | 44 | if isinstance(min_decay_lr, Iterable) and len(min_decay_lr) != len(optimizer.param_groups): 45 | raise ValueError("Expected len(min_decay_lr) to be equal to len(optimizer.param_groups), " 46 | "got {} and {} instead".format(len(min_decay_lr), len(optimizer.param_groups))) 47 | 48 | if restart_interval is not None and (not isinstance(restart_interval, int) or restart_interval < 1): 49 | raise ValueError("restart_interval must be positive integer, got {} instead".format(restart_interval)) 50 | 51 | if restart_interval_multiplier is not None and \ 52 | (not isinstance(restart_interval_multiplier, float) or restart_interval_multiplier <= 0): 53 | raise ValueError("restart_interval_multiplier must be positive float, got {} instead".format( 54 | restart_interval_multiplier)) 55 | 56 | if isinstance(restart_lr, Iterable) and len(restart_lr) != len(optimizer.param_groups): 57 | raise ValueError("Expected len(restart_lr) to be equal to len(optimizer.param_groups), " 58 | "got {} and {} instead".format(len(restart_lr), len(optimizer.param_groups))) 59 | 60 | if warmup_epochs is not None: 61 | if not isinstance(warmup_epochs, int) or warmup_epochs < 1: 62 | raise ValueError( 63 | "Expected warmup_epochs to be positive integer, got {} instead".format(type(warmup_epochs))) 64 | 65 | if warmup_start_lr is None: 66 | raise ValueError("warmup_start_lr must be set when warmup_epochs is not None") 67 | 68 | if not (isinstance(warmup_start_lr, float) or isinstance(warmup_start_lr, Iterable)): 69 | raise ValueError("warmup_start_lr must be either float or iterable of floats, got {} instead".format( 70 | warmup_start_lr)) 71 | 72 | if isinstance(warmup_start_lr, Iterable) and len(warmup_start_lr) != len(optimizer.param_groups): 73 | raise ValueError("Expected len(warmup_start_lr) to be equal to len(optimizer.param_groups), " 74 | "got {} and {} instead".format(len(warmup_start_lr), len(optimizer.param_groups))) 75 | 76 | group_num = len(optimizer.param_groups) 77 | self._warmup_start_lr = [warmup_start_lr] * group_num if isinstance(warmup_start_lr, float) else warmup_start_lr 78 | self._warmup_epochs = 0 if warmup_epochs is None else warmup_epochs 79 | self._init_decay_epochs = init_decay_epochs 80 | self._min_decay_lr = [min_decay_lr] * group_num if isinstance(min_decay_lr, float) else min_decay_lr 81 | self._restart_lr = [restart_lr] * group_num if isinstance(restart_lr, float) else restart_lr 82 | self._restart_interval = restart_interval 83 | self._restart_interval_multiplier = restart_interval_multiplier 84 | super(CyclicCosineDecayLR, self).__init__(optimizer, last_epoch, verbose=verbose) 85 | 86 | def get_lr(self): 87 | 88 | if self._warmup_epochs > 0 and self.last_epoch < self._warmup_epochs: 89 | return self._calc(self.last_epoch, 90 | self._warmup_epochs, 91 | self._warmup_start_lr, 92 | self.base_lrs) 93 | 94 | elif self.last_epoch < self._init_decay_epochs + self._warmup_epochs: 95 | return self._calc(self.last_epoch - self._warmup_epochs, 96 | self._init_decay_epochs, 97 | self.base_lrs, 98 | self._min_decay_lr) 99 | else: 100 | if self._restart_interval is 
not None: 101 | if self._restart_interval_multiplier is None: 102 | cycle_epoch = (self.last_epoch - self._init_decay_epochs - self._warmup_epochs) % self._restart_interval 103 | lrs = self.base_lrs if self._restart_lr is None else self._restart_lr 104 | return self._calc(cycle_epoch, 105 | self._restart_interval, 106 | lrs, 107 | self._min_decay_lr) 108 | else: 109 | n = self._get_n(self.last_epoch - self._warmup_epochs - self._init_decay_epochs) 110 | sn_prev = self._partial_sum(n) 111 | cycle_epoch = self.last_epoch - sn_prev - self._warmup_epochs - self._init_decay_epochs 112 | interval = self._restart_interval * self._restart_interval_multiplier ** n 113 | lrs = self.base_lrs if self._restart_lr is None else self._restart_lr 114 | return self._calc(cycle_epoch, 115 | interval, 116 | lrs, 117 | self._min_decay_lr) 118 | else: 119 | return self._min_decay_lr 120 | 121 | def _calc(self, t, T, lrs, min_lrs): 122 | return [min_lr + (lr - min_lr) * ((1 + cos(pi * t / T)) / 2) 123 | for lr, min_lr in zip(lrs, min_lrs)] 124 | 125 | def _get_n(self, epoch): 126 | _t = 1 - (1 - self._restart_interval_multiplier) * epoch / self._restart_interval 127 | return floor(log(_t, self._restart_interval_multiplier)) 128 | 129 | def _partial_sum(self, n): 130 | return self._restart_interval * (1 - self._restart_interval_multiplier ** n) / ( 131 | 1 - self._restart_interval_multiplier) 132 | -------------------------------------------------------------------------------- /utils/prediction.py: -------------------------------------------------------------------------------- 1 | from transformers import BertTokenizer 2 | import torch 3 | import numpy as np 4 | from pathlib import Path 5 | from utils.twitter_dataset import * 6 | from utils.result_visuals import * 7 | 8 | # single text prediction wrapper 9 | # preprocessing and result visual output 10 | class ModelOutput(): 11 | def __init__(self, wrapper, model_prefix, local=False): 12 | self.prefix = model_prefix 13 | self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") 14 | self.model = wrapper.model.to(self.device) 15 | self.local = local 16 | 17 | if self.local: 18 | local_model = f"models/final/{self.prefix}.pth" 19 | print(f"LOAD\tLoading local model from {local_model}") 20 | if not Path(local_model).is_file(): 21 | print(f"LOAD [ERROR] Unable to load local model: file {local_model} does not exist") 22 | 23 | state = torch.load(local_model) if torch.cuda.is_available() else torch.load(local_model, map_location='cpu') 24 | self.model.load_state_dict(state['model_state_dict']) 25 | 26 | self.outcomes = wrapper.n_outcomes 27 | self.cov = wrapper.cov 28 | self.weighted = wrapper.weighted 29 | self.feature = wrapper.features[0] 30 | self.tokenizer = BertTokenizer.from_pretrained(wrapper.original_model) 31 | self.benchmark = ModelBenchmark(wrapper, True, "pos", "mean", "mean" if self.cov else "type") 32 | 33 | self.result = None 34 | self.visual = None 35 | 36 | def prediction_output(self, text, filtering=True, visual=False): 37 | if filtering: 38 | text = nlp_filtering(text) 39 | print(f"TEXT\tFiltered text: {text}") 40 | 41 | self.result = ResultManager(None, text, self.feature, self.device, self.benchmark, False, False, self.prefix) 42 | 43 | if self.local: 44 | print("TEXT\tTokenizing text to input IDs and attention masks") 45 | encoded_corpus = self.tokenizer(text=text, 46 | add_special_tokens=True, 47 | padding='max_length', 48 | truncation='longest_first', 49 | max_length=300, 50 | return_attention_mask=True) 51 | 
input_id = encoded_corpus['input_ids'] 52 | attention_mask = encoded_corpus['attention_mask'] 53 | 54 | input = torch.tensor(input_id).to(self.device).reshape(1, -1) 55 | mask = torch.tensor(attention_mask).to(self.device).reshape(1, -1) 56 | 57 | self.model.eval() 58 | with torch.no_grad(): 59 | output = self.model(input, mask, self.feature) 60 | 61 | if self.cov: 62 | prob_model = self.benchmark.prob_models(output) 63 | 64 | output = output.cpu().numpy() if torch.cuda.is_available() else output.numpy() 65 | 66 | print(f"RESULT\tPost-processing raw model outputs: {output}") 67 | self.result.soft_outputs(list([prob_model])) if self.cov else self.result.coord_outputs(output) 68 | 69 | else: 70 | print("TEXT\tTokenizing text to input IDs and attention masks") 71 | inputs = self.tokenizer(text, return_tensors="pt") 72 | 73 | with torch.no_grad(): 74 | output = self.model(**inputs) 75 | prob_model = self.benchmark.prob_models(output) 76 | 77 | print(f"RESULT\tPost-processing raw model outputs: {output}") 78 | self.result.soft_outputs(list([prob_model])) 79 | 80 | if visual: 81 | self.visual = ResultVisuals(self.result) 82 | self.visual.text_map_result() 83 | 84 | return self.result 85 | -------------------------------------------------------------------------------- /utils/regressor.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from transformers import BertModel, BertPreTrainedModel, BertConfig 3 | 4 | # general model wrapper 5 | # linear regression fork for features and preset outputs 6 | class BERTregModel(): 7 | def __init__(self, n_outcomes=1, covariance=None, weighted=False, features=None, base_model_name=None, hub_model=None): 8 | self.n_outcomes = n_outcomes 9 | self.cov = covariance 10 | self.weighted = weighted 11 | self.features = ["NON-GEO"] if features is None else features 12 | 13 | print(f"MODEL\tInitializing BERT Regression model for {self.n_outcomes} outcome(s)") 14 | # features 15 | print(f"MODEL\tText features:\t{' + '.join(self.features)}") 16 | # longitude, latitude for n outcomes 17 | self.coord_output = self.n_outcomes * 2 18 | print(f"MODEL\tCoordinates:\t{self.coord_output}") 19 | # weights of gaussians 20 | self.weights_output = self.n_outcomes if self.weighted and self.n_outcomes > 1 else 0 21 | if self.weights_output > 0: 22 | print(f"MODEL\tWeights:\t{self.weights_output}") 23 | 24 | # covariance matrix 25 | self.covariances = {'spher': self.n_outcomes, 26 | 'diag': self.n_outcomes * 2, 27 | 'tied': 3, 28 | 'full': self.n_outcomes * 3} 29 | if self.cov is None: 30 | self.cov_output = 0 31 | print(f"MODEL\tNon-probabilistic model has been chosen") 32 | else: 33 | if self.cov not in self.covariances: 34 | self.cov = 'spher' 35 | self.cov_output = self.covariances[self.cov] 36 | print(f"MODEL\tCovariances:\t{self.cov_output}\tmatrix type:\t{self.cov}") 37 | 38 | self.original_model = "bert-base-multilingual-cased" if base_model_name is None else base_model_name 39 | print(f"MODEL\tOriginal model to load:\t{self.original_model}") 40 | 41 | self.key_output = self.coord_output + self.weights_output + self.cov_output 42 | self.minor_output = 2 43 | self.minor_output += 1 if self.cov_output > 0 else 0 44 | 45 | self.feature_outputs = {} 46 | for f in range(len(self.features)): 47 | if f == 0: 48 | output = self.key_output 49 | print(f"MODEL\tKey feature \t{self.features[f]} outputs:\t{output}") 50 | else: 51 | output = self.minor_output 52 | print(f"MODEL\tMinor feature\t{self.features[f]} 
outputs:\t{output}") 53 | self.feature_outputs[self.features[f]] = output 54 | 55 | if hub_model: 56 | self.model = GeoBertModel(BertConfig.from_pretrained(self.original_model), self.feature_outputs) 57 | print(f"LOAD\tLoading HF model from {hub_model}") 58 | self.model = self.model.from_pretrained(hub_model, self.feature_outputs) 59 | else: 60 | self.model = BertRegressor(self.original_model, self.feature_outputs) 61 | 62 | 63 | 64 | # Train model wrapper layer 65 | class BertRegressor(nn.Module): 66 | def __init__(self, model_name, feature_outputs): 67 | super(BertRegressor, self).__init__() 68 | self.bert = BertModel.from_pretrained(model_name, return_dict=True) 69 | self.feature_outputs = feature_outputs 70 | 71 | self.key_regressor = nn.Linear(768, list(self.feature_outputs.values())[0]) 72 | if len(self.feature_outputs) > 1: 73 | self.minor_regressor = nn.Linear(768, list(self.feature_outputs.values())[1]) 74 | 75 | def forward(self, input_ids, attention_masks, feature_name): 76 | outputs = self.bert(input_ids, attention_masks) 77 | if feature_name == list(self.feature_outputs.keys())[0]: 78 | outputs = self.key_regressor(outputs[1]) 79 | else: 80 | outputs = self.minor_regressor(outputs[1]) 81 | return outputs 82 | 83 | 84 | # HF model wrapper layer 85 | class GeoBertModel(BertPreTrainedModel): 86 | def __init__(self, config, feature_outputs): 87 | super().__init__(config) 88 | self.bert = BertModel(config) 89 | self.feature_outputs = feature_outputs 90 | 91 | self.key_regressor = nn.Linear(config.hidden_size, list(self.feature_outputs.values())[0]) 92 | if len(self.feature_outputs) > 1: 93 | self.minor_regressor = nn.Linear(config.hidden_size, list(self.feature_outputs.values())[1]) 94 | 95 | def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, feature_name=None): 96 | outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask) 97 | pooler_output = outputs[1] 98 | if feature_name is None or feature_name == list(self.feature_outputs.keys())[0]: 99 | custom_output = self.key_regressor(pooler_output) 100 | else: 101 | custom_output = self.minor_regressor(pooler_output) 102 | return custom_output 103 | -------------------------------------------------------------------------------- /utils/twitter_dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | import torch 5 | from torch.utils.data import TensorDataset, DataLoader 6 | 7 | import string 8 | import re 9 | import os 10 | import datetime 11 | 12 | from sklearn.model_selection import train_test_split 13 | 14 | import GPUtil 15 | import psutil 16 | 17 | # dataset wrapper 18 | 19 | 20 | def load_jsonl(filename): 21 | filename = f"datasets/{filename}" 22 | print(f"DATASET\tLOAD: {psutil.virtual_memory().percent}%\tLoading dataset from {filename}") 23 | data = pd.read_json(path_or_buf=filename, lines=True) 24 | print(f"DATASET\tLOAD: {psutil.virtual_memory().percent}%\tDataset of {len(data.index)} samples and {len(data.user.unique())} users is loaded") 25 | return data 26 | 27 | 28 | def save_df(df, filename, prefix=None): 29 | if prefix is None: 30 | prefix = "td-" 31 | 32 | size = len(df.index) 33 | save_filename = f"datasets/{prefix}{size}-{filename}" 34 | 35 | with open(save_filename, "w") as f: 36 | df.to_json(f, orient='records', lines=True) 37 | print(f"DATASET\tSAVE\tTwitter Dataset of {size} samples is 
written to file: {save_filename}") 38 | 39 | 40 | def combine_datasets(file_list, filename, prefix=None): 41 | if prefix is None: 42 | prefix = "td-" 43 | 44 | df = load_jsonl(file_list[0]) 45 | for file in file_list[1:]: 46 | data = load_jsonl(file) 47 | df = pd.concat([df, data], ignore_index=True) 48 | 49 | save_df(df, filename, prefix) 50 | 51 | 52 | def filter_bots(data, min_total=1, max_day=20): 53 | print(f"DATASET\tFiltering dataset of {len(data['user'].unique())} users from bots posting more than {max_day} tweets per day") 54 | if "time" in data.columns: 55 | data['date'] = pd.to_datetime(data['time'], utc=False).dt.date 56 | user_tweets_per_day = data.groupby(['date'])['user'].value_counts() 57 | user_tweets = user_tweets_per_day[user_tweets_per_day < max_day].droplevel(0).groupby(["user"]).sum() 58 | else: 59 | user_tweets = data['user'].value_counts() 60 | 61 | # data["time"] = data['time'].apply(lambda x: datetime.datetime.combine(x, datetime.time.min).timestamp()) 62 | data['date'] = data['time'].apply(lambda x: datetime.datetime.strptime(x, '%a %b %d %H:%M:%S %z %Y').timestamp()) 63 | # data.drop("date", axis=1, inplace=True) 64 | data["date"] = data["date"].astype(float) 65 | 66 | user_list = user_tweets[user_tweets > min_total].index.tolist() 67 | data = data[data['user'].isin(user_list)] 68 | 69 | print(f"DATASET\tSize of the filtered dataset with {len(data['user'].unique())} users: {len(data.index)} samples") 70 | return data 71 | 72 | 73 | # text preprocessing 74 | def nlp_filtering(text): 75 | def filter_punctuation(text): 76 | punctuationfree = "".join([i for i in text if i not in string.punctuation]) 77 | return punctuationfree 78 | 79 | def filter_websites(text): 80 | #pattern = r'(http\:\/\/|https\:\/\/)?([a-z0-9][a-z0-9\-]*\.)+[a-z][a-z\-]*' 81 | pattern = r'http\S+' 82 | text = re.sub(pattern, '', text) 83 | return text 84 | 85 | text = filter_websites(text) 86 | text = filter_punctuation(text) 87 | return text 88 | 89 | 90 | def create_dataloader(inputs, masks, labels, batch_size, shuffle=False): 91 | dataset = TensorDataset(torch.tensor(inputs), torch.tensor(masks), torch.tensor(labels)) 92 | return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle) 93 | 94 | 95 | # crop dataset to remove already used data before random sampling 96 | def crop_dataset(data, size, seed, by_user=False, ref_file=None, save=False): 97 | print(f"DATASET\tCropping dataset of {len(data.index)} in {size} samples with seed {seed}{' including unique users' if by_user else ''}") 98 | if by_user: 99 | if ref_file: 100 | df = load_jsonl(ref_file) 101 | crop_data = df.sample(n=size, random_state=seed) 102 | else: 103 | crop_data = data.sample(n=size, random_state=seed) 104 | data = data.drop(crop_data.index, axis=0) 105 | 106 | crop_users = crop_data["user"].unique() 107 | 108 | print(f"DATASET\tUnique users to crop: {len(crop_users)}") 109 | data = data[~data["user"].isin(crop_users)] 110 | print(f"DATASET\tUnique users left: {len(data['user'].unique())}") 111 | else: 112 | crop_data = data.sample(n=size, random_state=seed) 113 | data = data.drop(crop_data.index, axis=0) 114 | 115 | if save and ref_file: 116 | save_df(crop_data, ref_file, "train-") 117 | 118 | print(f"DATASET\tReduced dataset length is {len(data.index)}") 119 | return data 120 | 121 | 122 | # sample users (bots filtering) for evaluation 123 | def sample_users(data, users_n, seed=42): 124 | user_tweets = data['user'].value_counts() 125 | user_list = user_tweets.sample(n=users_n, random_state=seed).index.tolist() 126 | return 
data[data['user'].isin(user_list)] 127 | 128 | 129 | class TwitterDataloader(): 130 | def __init__(self, filename, features, target, tokenizer, seed=42, scaled=False, val_feature=None, bot_filter=False): 131 | self.filename = filename 132 | self.data = load_jsonl(self.filename) 133 | if "texts" in self.data.columns: 134 | self.data.rename(columns={'longitude': 'lon', 135 | 'latitude': 'lat', 136 | 'created_at': 'time', 137 | 'texts': 'text'}, inplace=True) 138 | self.data.to_json(f"datasets/{self.filename}", orient='records', lines=True) 139 | print(self.data.info()) 140 | 141 | if bot_filter: 142 | self.data = filter_bots(self.data) 143 | save_df(self.data, self.filename, "filtered-") 144 | 145 | self.target = target 146 | 147 | self.tokenizer = tokenizer 148 | self.seed = seed 149 | 150 | self.scaled = scaled 151 | 152 | self.feature_columns = { 153 | "GEO": ["text", "place", "user"], 154 | "NON-GEO": ["text", "user"], 155 | "NON-USER": ["text", "place"], 156 | "META-DATA": ["place", "user"], 157 | "TEXT-ONLY": ["text"], 158 | "GEO-ONLY": ["place"], 159 | "USER-ONLY": ["user"], 160 | } 161 | 162 | self.features = features 163 | self.n_features = len(features) 164 | self.key_feature = features[0] 165 | self.minor_features = features[1:] 166 | 167 | self.val_feature = features[0] if val_feature is None else val_feature 168 | 169 | self.val_dataloader, self.train_dataloader, self.test_dataloader = None, None, None 170 | self.val_df = None 171 | 172 | # filtering dataset files by column condition 173 | def filter_dataset(self, column_name, filter_text=None, filter_list=None, save=True): 174 | if filter_text: 175 | filter_df = self.data[self.data[column_name] == filter_text] 176 | prefix = f"f-{column_name}-{filter_text}-td-" 177 | elif filter_list: 178 | filter_df = self.data[self.data[column_name].isin(filter_list)] 179 | prefix = f"f-{column_name}-{'+'.join(filter_list)}-td-" 180 | 181 | if save: 182 | save_df(filter_df, self.filename, prefix) 183 | 184 | # tokenization - text to IDs and attention masks 185 | def tokenize(self, column): 186 | encoded_corpus = self.tokenizer(text=column, 187 | add_special_tokens=True, 188 | padding='max_length', 189 | truncation='longest_first', 190 | max_length=300, 191 | return_attention_mask=True) 192 | 193 | input_ids = encoded_corpus['input_ids'] 194 | attention_masks = encoded_corpus['attention_mask'] 195 | return input_ids, attention_masks 196 | 197 | # forming feature columns, dropping old columns 198 | def feature_split_filter(self, data): 199 | text_features = self.features + [self.val_feature] if self.val_feature not in self.features else self.features 200 | for f in text_features: 201 | data[f] = data[self.feature_columns[f]].astype(str).agg(" ".join, axis=1) if len(self.feature_columns[f]) > 1 else data[self.feature_columns[f]] 202 | data[f] = data[f].astype(str).apply(nlp_filtering) 203 | 204 | for f in text_features: 205 | for column in self.feature_columns[f]: 206 | if column in data.columns: 207 | data = data.drop(columns=[column], axis=1) 208 | return data 209 | 210 | def form_training(self, batch_size, size, test_ratio, skip_size=0, shuffle=True): 211 | if skip_size > 0: 212 | self.data = crop_dataset(self.data, skip_size, self.seed) 213 | 214 | print(f"DATASET\tForming training dataset of {size} samples with test size {int(test_ratio*size)} for features: {', '.join(self.features)}") 215 | train_df = self.data.sample(n=size, random_state=self.seed).copy() 216 | del self.data 217 | train_df = self.feature_split_filter(train_df) 218 | 
self.create_feature_dataloaders(train_df, True, batch_size, shuffle, test_ratio) 219 | 220 | def form_validation(self, batch_size, size, by_user=False, skip_size=0, ref_train_file=None): 221 | if skip_size > 0: 222 | self.data = crop_dataset(self.data, skip_size, self.seed, by_user, ref_train_file, True) 223 | 224 | # self.data = filter_bots(self.data) 225 | save_df(self.data, self.filename, "test-") 226 | 227 | print(f"DATASET\tForming validation dataset of {size} {'users' if by_user else 'samples'} with batch size {batch_size} for {self.val_feature} text feature") 228 | if by_user: 229 | self.val_df = sample_users(self.data, size, seed=self.seed).copy() 230 | self.features += ["USER-ONLY"] 231 | else: 232 | self.val_df = self.data.sample(n=size, random_state=self.seed).copy() 233 | del self.data 234 | 235 | print(f"DATASET\tSize of the validation dataset with {len(self.val_df['user'].unique())} users: {len(self.val_df.index)} samples") 236 | 237 | self.val_df = self.feature_split_filter(self.val_df) 238 | self.create_feature_dataloaders(self.val_df, False, batch_size) 239 | 240 | # training and evaluation dataloaders formation 241 | def create_feature_dataloaders(self, df, train, batch_size, shuffle=False, test_ratio=None): 242 | if train: 243 | train_index, test_index = train_test_split(df.index, test_size=test_ratio, random_state=self.seed) 244 | train_index, test_index = list(train_index), list(test_index) 245 | train_size, test_size = len(train_index), len(test_index) 246 | train_inputs, train_masks = np.empty((train_size, 0)), np.empty((train_size, 0)) 247 | 248 | for feature in self.features: 249 | input_ids, attention_mask = self.tokenize(df.loc[train_index, feature].tolist()) 250 | train_inputs, train_masks = np.concatenate((train_inputs, input_ids), axis=1), np.concatenate((train_masks, attention_mask), axis=1) 251 | 252 | test_inputs, test_masks = self.tokenize(df.loc[test_index, self.val_feature].tolist()) 253 | if self.scaled: 254 | train_labels = np.reshape(np.multiply(df.loc[train_index, self.target].to_numpy(), 0.01), (train_size, 2)) 255 | test_labels = np.reshape(np.multiply(df.loc[test_index, self.target].to_numpy(), 0.01), (test_size, 2)) 256 | else: 257 | train_labels = np.reshape(df.loc[train_index, self.target].to_numpy(), (train_size, 2)) 258 | test_labels = np.reshape(df.loc[test_index, self.target].to_numpy(), (test_size, 2)) 259 | 260 | self.train_dataloader = create_dataloader(np.reshape(train_inputs, (train_size, self.n_features, 300)), 261 | np.reshape(train_masks, (train_size, self.n_features, 300)), train_labels, batch_size, shuffle) 262 | self.test_dataloader = create_dataloader(test_inputs, test_masks, test_labels, batch_size, shuffle) 263 | del df 264 | else: 265 | labels = df[self.target].to_numpy() 266 | if self.scaled: 267 | labels = np.multiply(labels, 0.01) 268 | val_inputs, val_masks = self.tokenize(df[self.val_feature].tolist()) 269 | 270 | self.val_dataloader = create_dataloader(val_inputs, val_masks, labels, batch_size, shuffle) 271 | -------------------------------------------------------------------------------- /valid_data.py: -------------------------------------------------------------------------------- 1 | from utils.result_visuals import * 2 | from utils.regressor import * 3 | import torch 4 | 5 | # results manager and visual test on evaluated datasets 6 | ww = "bert-base-multilingual-cased" 7 | us = "bert-base-cased" 8 | 9 | feature = "NON-GEO" 10 | file = 
f"U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-NON-GEO_2023-02-19" 11 | 12 | input_pred = f"results/val-data/{file}.jsonl" 13 | 14 | # output_pred = f"results/val-data/{file}-out.jsonl" 15 | # output_map_point = f"img/map-test-{feature}.png" 16 | # output_map_line = f"img/map-test-{feature}.png" 17 | # output_dist = f"img/dist-test-{feature}.png" 18 | 19 | if torch.cuda.is_available(): 20 | device = torch.device("cuda") 21 | print(f"Available GPU has {torch.cuda.device_count()} devices, using {torch.cuda.get_device_name(0)}") 22 | else: 23 | print(f"No GPU available, using the CPU with {torch.get_num_threads()} threads instead.") 24 | device = torch.device("cpu") 25 | 26 | bert_wrapper = BERTregModel(n_outcomes=1, covariance="spher", weighted=False, features=["NON-GEO", "GEO-ONLY"], model_name=ww) 27 | model = ModelBenchmark(bert_wrapper, distance=True, loss_prob="pos", mf_loss="mean", total_loss="mean") 28 | result = ResultManager(None, None, feature, device, model, scaled=False, by_user=False, prefix=file) 29 | result.load_df(input_pred) 30 | 31 | # metrics 32 | # result.result_metrics(True, 100) 33 | # result.result_metrics(False, 100) 34 | 35 | result.performance() 36 | 37 | # visual = ResultVisuals(result) 38 | 39 | # standard 40 | # visual.density() 41 | # visual.cum_dist(False, 161) 42 | 43 | # GMM 44 | # visual.summarize_prediction(1) 45 | # visual.gaus_map() 46 | # visual.prob_map_animation(228) 47 | 48 | # visual.interactive_map(lines=False, best=True) 49 | 50 | # result.save_df() 51 | 52 | 53 | 54 | --------------------------------------------------------------------------------