├── .gitignore ├── README.md ├── collector.py ├── datasets ├── 200-gmm.jsonl └── bgmm-p200-c12057022.jsonl ├── input_entry.py ├── requirements.txt ├── results ├── img │ ├── cum_dist_U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-22.png │ ├── cum_dist_U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-02.png │ ├── cum_dist_U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-25.png │ ├── cum_dist_U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-01.png │ ├── cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-25.png │ ├── cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-10-31.png │ ├── cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_sum-mf_sum-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-29.png │ ├── cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-10-31.png │ ├── cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-01.png │ ├── cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-23.png │ ├── cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_sum-NP-weighted-N30e5-B12-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-29.png │ ├── density_U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-22.png │ ├── density_U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-02.png │ ├── density_U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-25.png │ ├── density_U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-01.png │ ├── density_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-25.png │ ├── density_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-10-31.png │ ├── density_U-NON-GEO+GEO-ONLY-O5-d-total_sum-mf_sum-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-29.png │ ├── density_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-10-31.png │ ├── density_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-01.png │ ├── density_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-23.png │ ├── density_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_sum-NP-weighted-N30e5-B12-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-29.png │ ├── gmm_contour_plot.png │ ├── gmm_likelihood_world.png │ ├── gmm_user_summary_S300_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N6615_VF-NON-GEO_2022-11-17_N6615_2022-11-17.png │ ├── gmm_user_summary_S38_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N218_VF-NON-GEO_2022-11-24_N218_2022-11-24.png │ ├── 
gmm_user_summary_S500_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N6615_VF-NON-GEO_2022-11-17_N6615_2022-11-17.png │ ├── text_map_U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_2022-11-23.png │ ├── text_map_U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_2022-11-23.png │ ├── text_map_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_2022-11-23.png │ └── text_map_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_2022-11-23.png ├── metric │ ├── EF-U-NON-GEO+GEO-ONLY-O3-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2023-01-25.txt │ ├── EIS-US-U-NON-GEO+GEO-ONLY-O3-d-total_mean-mf_mean-pos_spher-weighted-N3e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N76000_VF-NON-GEO_2023-02-09.txt │ ├── EIS-US-U-NON-GEO+GEO-ONLY-O3-d-total_mean-mf_mean-pos_spher-weighted-N3e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N76000_VF-TEXT-ONLY_2023-02-09.txt │ ├── U-GEO-O5-d-total_mean-pos_spher-weighted-N30e5-B12-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2022-11-28.txt │ ├── U-GEO-O5-d-total_mean-pos_spher-weighted-N30e5-B12-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2022-11-29.txt │ ├── U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-NON-GEO_2022-11-02_metric_N300000_VF-NON-GEO_2022-11-22.txt │ ├── U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-NON-GEO_2023-02-19_metric_N300000_VF-NON-GEO_2023-02-21.txt │ ├── U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-TEXT-ONLY_2022-11-02_metric_N300000_VF-TEXT-ONLY_2022-11-22.txt │ ├── U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-NON-GEO_2022-11-24_metric_N300000_VF-NON-GEO_2022-11-24.txt │ ├── U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-TEXT-ONLY_2022-11-01_metric_N300000_VF-TEXT-ONLY_2022-11-22.txt │ ├── U-NON-GEO+GEO-ONLY-O10-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2022-11-30.txt │ ├── U-NON-GEO+GEO-ONLY-O10-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2022-11-30.txt │ ├── U-NON-GEO+GEO-ONLY-O100-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2022-12-08.txt │ ├── U-NON-GEO+GEO-ONLY-O100-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-TEXT-ONLY_2022-11-29_metric_N300000_VF-TEXT-ONLY_2023-02-18.txt │ ├── U-NON-GEO+GEO-ONLY-O3-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2023-02-18.txt │ ├── U-NON-GEO+GEO-ONLY-O3-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2023-02-18.txt │ ├── U-NON-GEO+GEO-ONLY-O3-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2023-02-18.txt │ ├── U-NON-GEO+GEO-ONLY-O3-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2023-02-19.txt │ ├── 
U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-all_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2022-12-05.txt │ ├── U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-all_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2022-12-05.txt │ ├── U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N100_VF-NON-GEO_2022-11-17.txt │ ├── U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2023-02-20.txt │ ├── U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2023-02-24.txt │ ├── U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2023-02-24.txt │ ├── U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N5000_UVF-NON-GEO_2023-02-24.txt │ ├── U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N5000_UVF-TEXT-ONLY_2023-02-24.txt │ ├── U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N100_VF-NON-GEO_2022-11-17_metric_N100_VF-NON-GEO_2022-11-22.txt │ ├── U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-NON-GEO_2022-11-02_metric_N300000_VF-NON-GEO_2022-11-23.txt │ ├── U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-TEXT-ONLY_2022-11-02_metric_N300000_VF-TEXT-ONLY_2022-11-23.txt │ ├── U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2022-11-23.txt │ ├── U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2022-11-24.txt │ ├── U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-NON-GEO_2022-10-31_metric_N300000_VF-NON-GEO_2022-11-22.txt │ ├── U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-TEXT-ONLY_2022-11-01_metric_N300000_VF-TEXT-ONLY_2022-11-22.txt │ ├── U-NON-GEO+GEO-ONLY-O50-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2022-12-09.txt │ ├── U-NON-GEO+GEO-ONLY-O50-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2022-12-09.txt │ ├── U-NON-GEO-O5-d-total_mean-pos_spher-weighted-N30e5-B12-E2-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2022-11-28.txt │ ├── U-NON-GEO-O5-d-total_mean-pos_spher-weighted-N30e5-B12-E2-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2022-11-28.txt │ ├── U-TEXT-ONLY-O5-d-total_mean-pos_spher-weighted-N30e5-B12-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2022-11-30.txt │ ├── U-TEXT-ONLY-O5-d-total_mean-pos_spher-weighted-N30e5-B12-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2022-11-30.txt │ ├── US-U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N0e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N50_VF-NON-GEO_2022-11-22.txt │ ├── US-U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2023-02-23.txt │ ├── 
US-U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2023-02-23.txt │ ├── US-U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N4999_UVF-NON-GEO_2023-02-24.txt │ └── US-U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N4999_UVF-TEXT-ONLY_2023-02-24.txt └── val-data │ ├── U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N1000_2022-10-25.jsonl │ └── U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N6615_VF-NON-GEO_2022-11-17.jsonl ├── runs ├── prob │ ├── U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]-2022-10-06_logs │ │ └── events.out.tfevents.1665082080.gpu127.3070880.0 │ └── U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]-2022-10-06_logs │ │ └── events.out.tfevents.1665062385.gpu118.2632089.0 └── spat │ ├── U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]-2022-10-09_logs │ └── events.out.tfevents.1665313776.gpu113.3164054.0 │ └── U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]-2022-10-23_logs │ └── events.out.tfevents.1666523343.gpu148.1960620.0 ├── supplementary_resources ├── article_draft.pdf ├── img │ ├── loss-graph-prob.png │ ├── loss-graph-spat.png │ ├── map-density.png │ ├── model-train.png │ ├── mop-loss.png │ ├── prediction-example.png │ ├── sop-loss.png │ └── total-loss.png └── scripts │ ├── bash │ ├── collector.sh │ ├── data.sh │ └── train.sh │ └── python │ ├── bert_train.py │ ├── camambert-test.py │ ├── coords_plots.py │ ├── data-from-test.py │ ├── dev-loss-func-test.py │ ├── geotext-dataframe.py │ ├── hf_repo.py │ ├── json_split.py │ ├── loss_graph_prob.py │ ├── loss_graph_spat.py │ ├── ner-gazetteer-test.py │ ├── old_project_examples │ ├── extract_tweets_with_smileys.py │ ├── train_sentiment_classifier.py │ └── tweet_utils.py │ └── transformers-tutorial-test.py ├── text_result.py ├── train_bert.py ├── utils ├── benchmarks.py ├── cosine_scheduler.py ├── model_trainer.py ├── prediction.py ├── regressor.py ├── result_manager.py ├── result_visuals.py └── twitter_dataset.py └── valid_data.py /.gitignore: -------------------------------------------------------------------------------- 1 | ### Python template 2 | 3 | .idea/ 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 102 | __pypackages__/ 103 | 104 | # Celery stuff 105 | celerybeat-schedule 106 | celerybeat.pid 107 | 108 | # SageMath parsed files 109 | *.sage.py 110 | 111 | # Environments 112 | .env 113 | .venv 114 | env/ 115 | venv/ 116 | ENV/ 117 | env.bak/ 118 | venv.bak/ 119 | 120 | # Spyder project settings 121 | .spyderproject 122 | .spyproject 123 | 124 | # Rope project settings 125 | .ropeproject 126 | 127 | # mkdocs documentation 128 | /site 129 | 130 | # mypy 131 | .mypy_cache/ 132 | .dmypy.json 133 | dmypy.json 134 | 135 | # Pyre type checker 136 | .pyre/ 137 | 138 | # pytype static type analyzer 139 | .pytype/ 140 | 141 | # Cython debug symbols 142 | cython_debug/ 143 | 144 | 145 | # ---- project specific 146 | 147 | # model files 148 | *.pth 149 | *.bin 150 | /models/hf/ 151 | 152 | # logs 153 | /runs/ 154 | 155 | # sensitive data 156 | /results/val-data/ 157 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Geolocation Prediction BERT model 3 | 4 | This project aims to solve the tweet/user geolocation prediction task and to provide a flexible methodology for geotagging textual big data. The suggested approach uses neural networks for natural language processing (NLP) to estimate a location either as coordinates (longitude, latitude) or as a two-dimensional Gaussian Mixture Model (GMM). The proposed models were finetuned on a Twitter dataset, using pretrained Bidirectional Encoder Representations from Transformers (BERT) as the base model.
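To make the GMM formulation concrete: for the probabilistic models, a single prediction is a weighted set of two-dimensional Gaussian components over (longitude, latitude), and the mixture density can be evaluated anywhere on the map. The sketch below is purely illustrative (it is not part of the repository and the component values are invented); the actual output handling lives in `utils/result_manager.py` and `utils/prediction.py`.

```python
# Illustrative only: what "location as a 2D GMM" means numerically.
# The weights, means and sigmas below are invented example values.
import numpy as np
from scipy.stats import multivariate_normal

weights = np.array([0.62, 0.20, 0.10, 0.05, 0.03])             # mixture weights, sum to 1
means = np.array([[2.35, 48.85], [-0.13, 51.51], [13.40, 52.52],
                  [-73.99, 40.73], [139.69, 35.69]])           # (lon, lat) of each outcome
sigmas = np.array([1.5, 2.0, 2.5, 3.0, 3.0])                   # spherical std devs in degrees

def gmm_density(lon, lat):
    """Mixture likelihood of a point under the predicted GMM."""
    return sum(w * multivariate_normal.pdf([lon, lat], mean=m, cov=s ** 2)
               for w, m, s in zip(weights, means, sigmas))

print(gmm_density(2.29, 48.86))  # high density near the dominant component
```

The `spher` covariance option seen throughout the result file names refers to spherical (scalar-per-component) covariances of this kind.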
5 | 6 | [Predicting the Geolocation of Tweets Using BERT-Based Models Trained on Customized Data](https://arxiv.org/pdf/2303.07865.pdf) - paper pre-print on arXiv 7 | 8 | [geo-bert-multilingual](https://huggingface.co/k4tel/geo-bert-multilingual) - HuggingFace repository of the best model (Probabilistic, 5 outcomes, NON-GEO + GEO-ONLY) trained on the worldwide Twitter dataset 9 | 10 | ## Project structure 11 | 12 | - **datasets** - source folder for the input dataset files used during training and evaluation. For correct reading, the files should be in .jsonl format containing "lon", "lat", "text", "user" and "place" columns (JSON object fields). 13 | 14 | - **models** - folder containing files of local models and checkpoints in .pth format. 15 | 16 | - **results** - folder for output files such as images, evaluated datasets, and performance metric reports. 17 | 18 | - **utils** - folder containing the core utility Python classes 19 | - `benchmarks.py` - loss function computation and Tensorboard logging of training metrics 20 | - `cosine_scheduler.py` - [Cyclic Cosine Decay Learning Rate Scheduler](https://github.com/abhuse/cyclic-cosine-decay) 21 | - `twitter_dataset.py` - dataset wrapper class that implements feature forming, tokenization, and creation of PyTorch dataloaders 22 | - `regressor.py` - linear regression wrapper layer for BERT base models 23 | - `result_manager.py` - postprocessing of model outputs, writing and reading of evaluation result .jsonl files, performance metrics computation 24 | - `result_visuals.py` - visualization of results on matplotlib plots 25 | - `prediction.py` - single text prediction routine 26 | - `model_trainer.py` - training and evaluation of the models 27 | 28 | - `train_bert.py` - command line parameter input, entry point for training and evaluation 29 | - `input_entry.py` - entry point for single text prediction using local or HF repository models 30 | 31 | Additional: 32 | 33 | - **runs** - folder for storing training Tensorboard log files 34 | 35 | - **supplementary_resources** - folder containing testing and development Python scripts, and bash scripts for running jobs on a cluster with the Slurm management system 36 | 37 | - `valid_data.py` - shortcut for results management and visualization 38 | - `collector.py` - parsing of Twitter database files to collect dataset files 39 | 40 | ## Usage/Examples 41 | 42 | To run the project locally, first clone the repository: 43 | 44 | ```bash 45 | git clone https://github.com/K4TEL/geo-twitter.git 46 | ``` 47 | 48 | Then, in your Python environment, run: 49 | 50 | ```bash 51 | pip install -r requirements.txt 52 | ``` 53 | 54 | ### Training 55 | 56 | **NOTE!** To run finetuning, place a dataset file (.jsonl) containing "lon", "lat", "text", "user" and "place" columns (JSON object fields, no headers required) into the **datasets** folder. 57 | Then change the dataset file name in `train_bert.py` manually or by passing the `-d <filename>.jsonl` argument. 58 | 59 | To launch finetuning with the default hyperparameters, run: 60 | 61 | ```bash 62 | python train_bert.py --train 63 | ``` 64 | 65 | You can change the default hyperparameters manually in `train_bert.py` or pass command-line arguments using the predefined flags. 66 | The list of all flags can be found in the same entry point file. 67 | 68 | In practice, the learning rate, scheduler type, number of epochs, loss function parameters and target columns should remain the same.
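The dataset file itself is usually the main thing that changes between runs. For reference, one record in the expected .jsonl format can be produced as follows (a hedged sketch, not part of the repository; all field values are invented, and the concatenated `user`/`place` strings only mirror how `collector.py` assembles them):

```python
# Hypothetical helper, not part of the repo: appends one record in the format
# train_bert.py expects -- a .jsonl file with "lon", "lat", "text", "user", "place".
import json

record = {
    "lon": 14.42076,                                                   # longitude in degrees
    "lat": 50.08804,                                                   # latitude in degrees
    "text": "Morning coffee by the river",                             # tweet text
    "user": "Jane jane_doe Prague-based researcher Prague",            # name, screen name, description, location
    "place": "Czech Republic city Prague Prague, Czech Republic CZ",   # country, type, name, full name, code
}

with open("datasets/example.jsonl", "a", encoding="utf-8") as f:
    f.write(json.dumps(record, ensure_ascii=False) + "\n")
```

Training on such a file then amounts to `python train_bert.py --train -d example.jsonl` (the file name here is only an example).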
69 | Commonly changed parameters include the number of outcomes, covariance type, features, dataset file name, training dataloader size, batch size and log step. 70 | 71 | During finetuning, training metrics and test metrics (calculated at the end of each epoch) are written to the **runs** folder. 72 | Model performance tracking is implemented using the Tensorboard Python library. 73 | Model files and their checkpoints are saved to the **models** directory automatically. 74 | 75 | ### Evaluation 76 | 77 | **NOTE!** To run evaluation, place a dataset file into the **datasets** folder 78 | and make sure a finetuned model file in .pth format is present in the **models** directory. 79 | 80 | To launch the evaluation with default settings, run: 81 | 82 | ```bash 83 | python train_bert.py --eval 84 | ``` 85 | 86 | In this case, the model file will be chosen automatically according to the file name prefix formed from the preset hyperparameters. 87 | To pick a model manually, adjust the hyperparameters (number of outcomes, covariance type, features, loss function type) to match the previously finetuned model and run: 88 | 89 | ```bash 90 | python train_bert.py --eval -m <model file name> 91 | ``` 92 | 93 | Commonly changed parameters for the evaluation are the dataset file name, validation dataloader size and model file name. 94 | 95 | To perform per-user evaluation, use the `-vu -v <N>` flags, which will pick the N users with the highest number of samples from the dataset. 96 | In this case, the performance metrics are averaged per user rather than per tweet. 97 | Note that only probabilistic models using GMMs can summarize multiple per-tweet predictions. 98 | 99 | The results of the evaluation are written to a .jsonl dataset file containing the input and output of the model. 100 | By default, performance metrics are calculated at the end and written to a short .txt report file. 101 | Visualizations of the error distance density and its cumulative distribution per outcome are drawn to .png files. 102 | 103 | Using `valid_data.py` you can read saved prediction files and use the visualization functions more easily. 104 | 105 | All outputs of the evaluation are stored in the **results** folder. 106 | 107 | ### Prediction 108 | 109 | **NOTE!** To run single text prediction with local models, place the finetuned .pth model files in the **models/final** directory.
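If you prefer to call the predictor from Python rather than through the interactive prompt below, a minimal sketch adapted from `input_entry.py` is shown here. It assumes the default probabilistic 5-outcome setup and the `k4tel/geo-bert-multilingual` HuggingFace model referenced above; treat it as a starting point, not a stable API.

```python
# Minimal programmatic use, adapted from input_entry.py (defaults: probabilistic
# model, spherical covariance, 5 weighted outcomes, NON-GEO + GEO-ONLY features).
from utils.prediction import *
from utils.regressor import *

prefix = "k4tel/geo-bert-multilingual"                    # HuggingFace repository of the best model
model_wrapper = BERTregModel(5, "spher", True, ["NON-GEO", "GEO-ONLY"], None, prefix)
prediction = ModelOutput(model_wrapper, prefix, False)    # False -> not a local .pth model

result = prediction.prediction_output("Some text to geolocate", filtering=True, visual=False)
print(result.means[0])    # predicted (lon, lat) per outcome
print(result.weights[0])  # corresponding outcome weights
```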
110 | 111 | To launch the prediction with default settings, run: 112 | 113 | ```bash 114 | python input_entry.py 115 | ``` 116 | Parameters like the number of outcomes, probabilistic or geospatial model type, local model file and text can be specified via flags: 117 | 118 | ```bash 119 | python input_entry.py -m <model> -t <text> 120 | ``` 121 | 122 | ## Support 123 | 124 | For support, email lutsai.k@gmail.com 125 | -------------------------------------------------------------------------------- /collector.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | import pandas as pd 4 | import os, glob 5 | 6 | country_codes = ["CA", "GB", "FR"] 7 | 8 | def parse_geo(obj): 9 | print(obj['coordinates']["coordinates"]) 10 | coord_long = obj['coordinates']["coordinates"][0] 11 | coord_lat = obj['coordinates']["coordinates"][1] 12 | return coord_long,coord_lat 13 | 14 | 15 | def parse_user(obj): 16 | location = obj['user']['location'] if obj["user"]["location"] else "" 17 | username = obj['user']['name'] if obj["user"]["name"] else "" 18 | screen = obj['user']['screen_name'] if obj["user"]["screen_name"] else "" 19 | description = obj['user']['description'] if obj["user"]["description"] else "" 20 | user = f"{username} {screen} {description} {location}" 21 | return user 22 | 23 | 24 | def parse_place(obj): 25 | full = obj['place']['full_name'] if obj['place']['full_name'] else "" 26 | country = obj['place']['country'] if obj['place']['country'] else "" 27 | code = obj['place']['country_code'] if obj['place']['country_code'] else "" 28 | name = obj['place']['name'] if obj['place']['name'] else "" 29 | type = obj['place']['place_type'] if obj['place']['place_type'] else "" 30 | place = f"{country} {type} {name} {full} {code}" 31 | return place, code 32 | 33 | 34 | def parse_tweet(obj): 35 | text = obj['text'] 36 | time = obj['created_at'] 37 | lang = obj['lang'] if obj['lang'] else "" 38 | return text, time, lang 39 | 40 | 41 | def parse_train(fid): 42 | known_ids = set() 43 | 44 | for line in fid: 45 | if not line: 46 | continue 47 | try: 48 | obj = json.loads(line) 49 | except (json.decoder.JSONDecodeError, TypeError): 50 | print("ERROR: entry wasn't a dictionary. 
skipping.", file=sys.stderr) 51 | continue 52 | 53 | try: 54 | if 'id_str' not in obj: 55 | print("ERROR: 'id_str' field not found in tweet", file=sys.stderr) 56 | continue 57 | if 'place' not in obj: 58 | print("ERROR: 'place' field not found in tweet", file=sys.stderr) 59 | continue 60 | if 'user' not in obj: 61 | print("ERROR: 'user' field not found in tweet", file=sys.stderr) 62 | continue 63 | if 'coordinates' not in obj: 64 | print("ERROR: 'coordinates' field not found in tweet", file=sys.stderr) 65 | continue 66 | if 'created_at' not in obj: 67 | print("ERROR: 'created_at' field not found in tweet {}".format(obj['id_str']), file=sys.stderr) 68 | continue 69 | 70 | except TypeError: 71 | print("ERROR: not a dict?", line, obj, file=sys.stderr) 72 | continue 73 | 74 | if not obj["coordinates"]: 75 | continue 76 | if not obj["coordinates"]["coordinates"]: 77 | continue 78 | if not obj["place"]: 79 | continue 80 | # if obj["place"]["country_code"] not in country_codes: 81 | # continue 82 | if not obj["user"]: 83 | continue 84 | 85 | if obj['id_str'] in known_ids: # duplicate 86 | continue 87 | 88 | text, time, lang = parse_tweet(obj) 89 | long, lat = parse_geo(obj) 90 | place, code = parse_place(obj) 91 | user = parse_user(obj) 92 | known_ids.add(obj['id_str']) 93 | 94 | yield (long, lat, text, time, lang, code, place, user) 95 | 96 | 97 | def read_train(filename): 98 | print("Reading data from:", filename) 99 | with open(filename, encoding='utf-8') as fid: 100 | lines = parse_train(fid) 101 | longs, lats, texts, times, langs, codes, places, users = zip(*lines) 102 | 103 | data_geo = { 104 | 'lon':longs, 105 | 'lat':lats, 106 | 'time':times, 107 | 'texts':texts, 108 | 'lang':langs, 109 | 'code':codes, 110 | 'place':places, 111 | 'user':users 112 | } 113 | 114 | print("Training set of ===", len(longs), "=== samples is collected") 115 | 116 | return pd.DataFrame(data_geo) 117 | 118 | 119 | def write_records(file, df): 120 | with open(file, "w") as f: 121 | df.to_json(f, orient='records', lines=True) 122 | print("Data written to file:", file) 123 | 124 | 125 | def combine(file): 126 | os.chdir(os.path.dirname(__file__) + r"/filtered_json") 127 | 128 | extension = 'txt' 129 | all_filenames = [i for i in glob.glob('*.{}'.format(extension))] 130 | 131 | combined_txt = pd.concat([pd.read_json(path_or_buf=f, lines=True) for f in all_filenames ]) 132 | 133 | #os.chdir(os.path.dirname(__file__)) 134 | 135 | with open(file, "w") as f: 136 | combined_txt.to_json(file, orient='records', lines=True) 137 | print(f"Data from {len(all_filenames)} files written to common dataset: {file}") 138 | 139 | 140 | test_input_file = '/run/user/1005618/gvfs/smb-share:server=archive3.ist.local,share=group/chlgrp/twitter-collection-2022/twitter-2022-01-25.txt' 141 | output_folder = "datasets/" 142 | 143 | 144 | def main(concat_to_one=False): 145 | try: # 1 arg - data input (txt) 146 | filename = sys.argv[1] 147 | except IndexError: 148 | filename = test_input_file 149 | print(f"Input file: {filename}") 150 | 151 | try: # 2 arg - filtered data output (jsonl) 152 | output_filename = sys.argv[2] 153 | except IndexError: 154 | head, tail = os.path.split(filename) 155 | output_filename = output_folder + tail 156 | open(output_filename, 'w').close() # if not exists 157 | print(f"Output file: {output_filename}") 158 | 159 | # manual testing 160 | # df_geo = read_train(filename) 161 | # write_records(output_filename, df_geo) 162 | 163 | try: 164 | df_geo = read_train(filename) 165 | write_records(output_filename, df_geo) 166 | except 
Exception as e: 167 | print("Couldn't form dataset:", e) 168 | 169 | if concat_to_one: # if all .txt per-day files are parsed - combine to single json 170 | combine("ca-twitter-2022.jsonl") 171 | 172 | 173 | if __name__ == "__main__": 174 | main() 175 | -------------------------------------------------------------------------------- /input_entry.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from utils.prediction import * 3 | from utils.regressor import * 4 | 5 | # Entry point for prediction from text (model .pth files needed) 6 | 7 | local_ww_models = { 8 | "gsop": "G-NON-GEO+GEO-ONLY-O1", 9 | "gmop": "G-NON-GEO+GEO-ONLY-O5", 10 | "psop": "P-NON-GEO+GEO-ONLY-O1", 11 | "pmop": "P-NON-GEO+GEO-ONLY-O5" 12 | } 13 | 14 | outcomes = 5 # 1 or 5 15 | prob = True # True or False 16 | 17 | features = ["NON-GEO", "GEO-ONLY"] 18 | 19 | text_example = "CIA and FBI can track anyone, and you willingly give the data away" 20 | 21 | local = False 22 | hub_model_prefix = "k4tel/geo-bert-multilingual" 23 | 24 | 25 | def main(): 26 | parser = argparse.ArgumentParser(description='Prediction of geolocations') 27 | parser.add_argument('-o', '--outcomes', type=int, default=outcomes, help="Number of outcomes (lon, lat) per tweet (default: 5)") 28 | parser.add_argument('-s', '--spat', action="store_true", help="Use geospatial model (default: probabilistic)") 29 | parser.add_argument('-l', '--local', action="store_true", help="Use model stored locally") 30 | parser.add_argument('-m', '--model', type=str, default=None, help='Filename prefix of local model OR HuggingFace repository link') 31 | parser.add_argument('-t', '--text', type=str, default=None, help='Text to process (max: 300 words)') 32 | args = parser.parse_args() 33 | 34 | weighted = args.outcomes > 1 35 | covariance = None if args.spat else "spher" 36 | 37 | if args.model: # models/final/.pth file; NOTE correct setup is needed 38 | prefix = args.model 39 | elif args.model is None and args.local: # picking local model according to the setup 40 | if args.outcomes > 1: 41 | local_model_prefix = local_ww_models["gmop"] if args.spat else local_ww_models["pmop"] 42 | else: 43 | local_model_prefix = local_ww_models["gsop"] if args.spat else local_ww_models["psop"] 44 | 45 | prefix = local_model_prefix 46 | else: # setup for P-NON-GEO+GEO-ONLY-O5 47 | weighted = True 48 | covariance = "spher" 49 | args.outcomes = 5 50 | args.spat = False 51 | prefix = hub_model_prefix 52 | 53 | # if not local - loading automatically on BERTregModel init 54 | model_wrapper = BERTregModel(args.outcomes, covariance, weighted, features, None, prefix) \ 55 | if not args.local else BERTregModel(args.outcomes, covariance, weighted, features) 56 | 57 | # if local - loading automatically on ModelOutput init 58 | prediction = ModelOutput(model_wrapper, prefix, args.local) 59 | 60 | print(f"MODEL\tBERT geo regression model is ready, you can now predict location from the text (300 words max) " 61 | f"in the form of {'Gaussian distributions (lon, lat, cov)' if not args.spat else 'coordinates (lon, lat)'}" 62 | f" with {args.outcomes} possible prediction outcomes.\nNOTE\tOutcomes that have very low weight won't be displayed") 63 | 64 | text = args.text if args.text else input("Insert text: ") 65 | while text != "exit": 66 | if len(text) == 0: 67 | text = text_example 68 | if len(text.split()) < 300: 69 | result = prediction.prediction_output(text, filtering=True, visual=False) 70 | 71 | if args.outcomes > 1: 72 | ind = 
np.argwhere(np.round(result.weights[0, :] * 100, 2) > 0) 73 | significant = result.means[0, ind].reshape(-1, 2) 74 | weights = result.weights[0, ind].flatten() 75 | else: 76 | significant = result.means.reshape(-1, 2) 77 | weights = np.ones(1) 78 | 79 | sig_weights = np.round(weights * 100, 2) 80 | sig_weights = sig_weights[sig_weights > 0] 81 | 82 | print(f"RESULT\t{len(sig_weights)} significant prediction outcome(s):") 83 | 84 | for i in range(len(sig_weights)): 85 | point = f"lon: {' lat: '.join(map(str, significant[i]))}" 86 | print(f"\tOut {i + 1}\t{sig_weights[i]}%\t-\t{point}") 87 | 88 | else: 89 | print(f"Number of words is above 300, unable to process.") 90 | 91 | text = args.text if args.text else input("Insert text: ") 92 | 93 | 94 | if __name__ == "__main__": 95 | main() 96 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | geopandas==0.12.2 2 | geopy==2.3.0 3 | GPUtil==1.4.0 4 | imageio==2.25.1 5 | matplotlib==3.7.0 6 | moviepy==1.0.3 7 | numpy==1.21.5 8 | pandas==1.5.2 9 | plotly==5.13.1 10 | psutil==5.9.4 11 | pyarrow==11.0.0 12 | scikit_learn==1.2.1 13 | scipy==1.10.1 14 | seaborn==0.12.2 15 | Shapely==2.0.1 16 | spacy==3.5.0 17 | torch==1.13.1 18 | torchvision==0.14.1 19 | torchaudio==0.13.1 20 | torchtext==0.14.1 21 | fastai==2.7.11 22 | tokenizers 23 | torchdata==0.5.1 24 | tqdm==4.64.1 25 | transformers==4.26.0 26 | Basemap==1.3.6 27 | basemap-data-hires==1.3.2 28 | -------------------------------------------------------------------------------- /results/img/cum_dist_U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-22.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/cum_dist_U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-22.png -------------------------------------------------------------------------------- /results/img/cum_dist_U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/cum_dist_U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-02.png -------------------------------------------------------------------------------- /results/img/cum_dist_U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-25.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/cum_dist_U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-25.png -------------------------------------------------------------------------------- /results/img/cum_dist_U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-01.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/cum_dist_U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-01.png -------------------------------------------------------------------------------- /results/img/cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-25.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-25.png -------------------------------------------------------------------------------- /results/img/cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-10-31.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-10-31.png -------------------------------------------------------------------------------- /results/img/cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_sum-mf_sum-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-29.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_sum-mf_sum-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-29.png -------------------------------------------------------------------------------- /results/img/cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-10-31.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-10-31.png -------------------------------------------------------------------------------- /results/img/cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-01.png -------------------------------------------------------------------------------- /results/img/cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-23.png -------------------------------------------------------------------------------- 
/results/img/cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_sum-NP-weighted-N30e5-B12-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-29.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/cum_dist_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_sum-NP-weighted-N30e5-B12-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-29.png -------------------------------------------------------------------------------- /results/img/density_U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-22.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/density_U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-22.png -------------------------------------------------------------------------------- /results/img/density_U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/density_U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-02.png -------------------------------------------------------------------------------- /results/img/density_U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-25.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/density_U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-25.png -------------------------------------------------------------------------------- /results/img/density_U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/density_U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-01.png -------------------------------------------------------------------------------- /results/img/density_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-25.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/density_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-25.png -------------------------------------------------------------------------------- /results/img/density_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-10-31.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/density_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-10-31.png -------------------------------------------------------------------------------- /results/img/density_U-NON-GEO+GEO-ONLY-O5-d-total_sum-mf_sum-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-29.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/density_U-NON-GEO+GEO-ONLY-O5-d-total_sum-mf_sum-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-29.png -------------------------------------------------------------------------------- /results/img/density_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-10-31.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/density_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-10-31.png -------------------------------------------------------------------------------- /results/img/density_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/density_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-01.png -------------------------------------------------------------------------------- /results/img/density_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/density_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_N300000_2022-11-23.png -------------------------------------------------------------------------------- /results/img/density_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_sum-NP-weighted-N30e5-B12-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-29.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/density_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_sum-NP-weighted-N30e5-B12-E3-cosine-LR[1e-05;1e-06]_N100000_2022-10-29.png -------------------------------------------------------------------------------- /results/img/gmm_contour_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/gmm_contour_plot.png -------------------------------------------------------------------------------- /results/img/gmm_likelihood_world.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/gmm_likelihood_world.png 
-------------------------------------------------------------------------------- /results/img/gmm_user_summary_S300_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N6615_VF-NON-GEO_2022-11-17_N6615_2022-11-17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/gmm_user_summary_S300_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N6615_VF-NON-GEO_2022-11-17_N6615_2022-11-17.png -------------------------------------------------------------------------------- /results/img/gmm_user_summary_S38_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N218_VF-NON-GEO_2022-11-24_N218_2022-11-24.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/gmm_user_summary_S38_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N218_VF-NON-GEO_2022-11-24_N218_2022-11-24.png -------------------------------------------------------------------------------- /results/img/gmm_user_summary_S500_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N6615_VF-NON-GEO_2022-11-17_N6615_2022-11-17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/gmm_user_summary_S500_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N6615_VF-NON-GEO_2022-11-17_N6615_2022-11-17.png -------------------------------------------------------------------------------- /results/img/text_map_U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_2022-11-23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/text_map_U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_2022-11-23.png -------------------------------------------------------------------------------- /results/img/text_map_U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_2022-11-23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/text_map_U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_2022-11-23.png -------------------------------------------------------------------------------- /results/img/text_map_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_2022-11-23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/text_map_U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_2022-11-23.png -------------------------------------------------------------------------------- 
/results/img/text_map_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_2022-11-23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/results/img/text_map_U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_2022-11-23.png -------------------------------------------------------------------------------- /results/metric/EF-U-NON-GEO+GEO-ONLY-O3-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2023-01-25.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 57.1224663336316 3 | Median SAE 3.630642382829759 4 | MSE 21.350589606782105 5 | MAE 0.8632753163412875 6 | Acc@100 94.73133333333334 7 | Acc@161 96.005 8 | Average CAE 97.14063901745017 9 | Median CAE 47.03875416171589 10 | Average 95% PRA 5.7877121133727405 11 | Median 95% PRA 2.9956897498650576 12 | PRA COVerage 0.7470066666666667 13 | Outcome ALL 3 14 | Average SAE 57.19130247060087 15 | Median SAE 3.6816462055878008 16 | MSE 21.341014506627996 17 | MAE 0.4321043145344873 18 | Acc@100 94.72033333333333 19 | Acc@161 96.00333333333333 20 | Average CAE 99.84322494836586 21 | Median CAE 47.643151346524434 22 | Average 95% PRA 8.826576905114056 23 | Median 95% PRA 7.581258966300876 24 | PRA COVerage 0.58095 25 | -------------------------------------------------------------------------------- /results/metric/EIS-US-U-NON-GEO+GEO-ONLY-O3-d-total_mean-mf_mean-pos_spher-weighted-N3e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N76000_VF-NON-GEO_2023-02-09.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 1094.6138078166332 3 | Median SAE 769.7310232012596 4 | MSE 141.878554113195 5 | MAE 14.264336468087135 6 | Acc@100 0.4144736842105263 7 | Acc@161 0.8763157894736843 8 | Average CAE 1659.6462067284033 9 | Median CAE 1378.7820427028882 10 | Average 95% PRA 1676.6693654767037 11 | Median 95% PRA 1658.3572044843243 12 | PRA COVerage 0.028342105263157894 13 | Outcome ALL 3 14 | Average SAE 1094.6138086046483 15 | Median SAE 769.7310234018382 16 | MSE 141.87855446328854 17 | MAE 7.132168239106902 18 | Acc@100 0.4144736842105263 19 | Acc@161 0.8763157894736843 20 | Average CAE 1658.9157483435374 21 | Median CAE 1377.346818732903 22 | Average 95% PRA 177.55978655591733 23 | Median 95% PRA 176.67033683902014 24 | PRA COVerage 0.09657017543859649 25 | -------------------------------------------------------------------------------- /results/metric/EIS-US-U-NON-GEO+GEO-ONLY-O3-d-total_mean-mf_mean-pos_spher-weighted-N3e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N76000_VF-TEXT-ONLY_2023-02-09.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 1215.7329909213397 3 | Median SAE 787.4934604866901 4 | MSE 199.027752393994 5 | MAE 15.800832527802372 6 | Acc@100 0.23947368421052634 7 | Acc@161 0.5342105263157895 8 | Average CAE 1773.335490256125 9 | Median CAE 1390.2649093982022 10 | Average 95% PRA 1658.4397793658538 11 | Median 95% PRA 1679.9652339034906 12 | PRA COVerage 0.01938157894736842 13 | Outcome ALL 3 14 | Average SAE 1216.029780347764 15 | Median SAE 787.4934607697677 16 | MSE 199.3000028756908 17 | MAE 7.9021518988569746 18 | Acc@100 0.23947368421052634 19 | Acc@161 0.5342105263157895 20 | Average CAE 
1773.0230774157867 21 | Median CAE 1389.7878439109427 22 | Average 95% PRA 175.83709034876665 23 | Median 95% PRA 177.81759959293902 24 | PRA COVerage 0.09093421052631578 25 | -------------------------------------------------------------------------------- /results/metric/U-GEO-O5-d-total_mean-pos_spher-weighted-N30e5-B12-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2022-11-28.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 1266.1940182232079 3 | Median SAE 37.688264739505954 4 | MSE 743.222800867309 5 | MAE 17.220915844783917 6 | Acc@100 62.153000000000006 7 | Acc@161 67.02466666666666 8 | Average CAE 1292.5292902055553 9 | Median CAE 62.64815599155605 10 | Average 95% PRA 27.22535086762321 11 | Median 95% PRA 2.9983193329878755 12 | PRA COVerage 0.21840666666666667 13 | Outcome ALL 5 14 | Average SAE 1266.1951182548135 15 | Median SAE 37.68981554393958 16 | MSE 743.2222789348478 17 | MAE 8.610462794339353 18 | Acc@100 62.153666666666666 19 | Acc@161 67.02466666666666 20 | Average CAE 1292.554557406498 21 | Median CAE 62.64599706100019 22 | Average 95% PRA 14.736914482446378 23 | Median 95% PRA 7.515493132740927 24 | PRA COVerage 0.117392 25 | -------------------------------------------------------------------------------- /results/metric/U-GEO-O5-d-total_mean-pos_spher-weighted-N30e5-B12-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2022-11-29.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 3203.720367762154 3 | Median SAE 585.8696742244236 4 | MSE 2027.3966353043922 5 | MAE 44.29867222675726 6 | Acc@100 38.46633333333334 7 | Acc@161 41.17966666666666 8 | Average CAE 3225.9160141364423 9 | Median CAE 623.3095886278236 10 | Average 95% PRA 86.12589795786408 11 | Median 95% PRA 3.8511226439992745 12 | PRA COVerage 0.16536 13 | Outcome ALL 5 14 | Average SAE 3203.720654187707 15 | Median SAE 585.8698625325562 16 | MSE 2027.3946785780554 17 | MAE 22.149332638486232 18 | Acc@100 38.46633333333334 19 | Acc@161 41.17966666666666 20 | Average CAE 3225.8762513200772 21 | Median CAE 623.4343025445885 22 | Average 95% PRA 29.854989140654062 23 | Median 95% PRA 8.514526081273205 24 | PRA COVerage 0.07474533333333333 25 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-NON-GEO_2022-11-02_metric_N300000_VF-NON-GEO_2022-11-22.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 568.2775034320997 3 | Median SAE 32.0780867576 4 | MSE 334.60555609377246 5 | MAE 7.819104565829121 6 | Acc@100 72.53866666666666 7 | Acc@161 78.28633333333333 8 | Average CAE 639.2131073260726 9 | Median CAE 70.40403356928135 10 | Average 95% PRA 60.94092486527804 11 | Median 95% PRA 3.705517191563791 12 | PRA COVerage 0.19613666666666665 13 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-NON-GEO_2023-02-19_metric_N300000_VF-NON-GEO_2023-02-21.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 526.2267981509376 3 | Median SAE 37.7204090414 4 | MSE 259.39274008786316 5 | MAE 7.027323746037905 6 | Acc@100 69.76733333333334 7 | 
Acc@161 76.324 8 | Average CAE 598.4284945825225 9 | Median CAE 78.52907853415618 10 | Average 95% PRA 62.67164017585331 11 | Median 95% PRA 4.03528769698187 12 | PRA COVerage 0.17678666666666668 13 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-TEXT-ONLY_2022-11-02_metric_N300000_VF-TEXT-ONLY_2022-11-22.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 1881.226679489545 3 | Median SAE 152.99867793205 4 | MSE 1118.1127814060806 5 | MAE 25.450553855346023 6 | Acc@100 46.009 7 | Acc@161 50.49133333333333 8 | Average CAE 1953.9709490024165 9 | Median CAE 352.7690710078982 10 | Average 95% PRA 174.00230971889167 11 | Median 95% PRA 60.76411961255744 12 | PRA COVerage 0.12542333333333333 13 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-NON-GEO_2022-11-24_metric_N300000_VF-NON-GEO_2022-11-24.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 559.6151196493181 3 | Median SAE 36.613367614249995 4 | MSE 319.76210796374187 5 | MAE 7.719847568522422 6 | Acc@100 72.44500000000001 7 | Acc@161 78.39566666666667 8 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-TEXT-ONLY_2022-11-01_metric_N300000_VF-TEXT-ONLY_2022-11-22.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 1872.1827726693336 3 | Median SAE 140.2990609901 4 | MSE 1150.9366438239665 5 | MAE 25.576461171268132 6 | Acc@100 46.70166666666667 7 | Acc@161 51.282666666666664 8 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O10-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2022-11-30.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 552.887252254477 3 | Median SAE 32.92428538962793 4 | MSE 314.9492124902332 5 | MAE 7.626378681627015 6 | Acc@100 73.48566666666667 7 | Acc@161 78.86800000000001 8 | Average CAE 588.9925675399514 9 | Median CAE 64.98670458833743 10 | Average 95% PRA 36.763179226711436 11 | Median 95% PRA 3.3166217318692706 12 | PRA COVerage 0.08798 13 | Outcome ALL 10 14 | Average SAE 553.176748588044 15 | Median SAE 33.23239487280529 16 | MSE 315.00555681992574 17 | MAE 3.814997443082041 18 | Acc@100 73.442 19 | Acc@161 78.847 20 | Average CAE 599.809585161102 21 | Median CAE 77.92234001691176 22 | Average 95% PRA 15.566235104908506 23 | Median 95% PRA 9.425969628868724 24 | PRA COVerage 0.110402 25 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O10-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2022-11-30.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 1844.6686523197552 3 | Median SAE 142.68284884641122 4 | MSE 1127.9185978737346 5 | MAE 25.23800925753932 6 | Acc@100 46.693 7 | 
Acc@161 51.11 8 | Average CAE 1896.672505641246 9 | Median CAE 191.6308764982617 10 | Average 95% PRA 173.8400432096454 11 | Median 95% PRA 5.282659731027015 12 | PRA COVerage 0.06211 13 | Outcome ALL 10 14 | Average SAE 1844.9623918688806 15 | Median SAE 143.21406110901034 16 | MSE 1127.9052539828947 17 | MAE 12.620134586598647 18 | Acc@100 46.653666666666666 19 | Acc@161 51.07633333333334 20 | Average CAE 1901.3933567712004 21 | Median CAE 214.22602320099966 22 | Average 95% PRA 32.41118289375993 23 | Median 95% PRA 13.948084191235818 24 | PRA COVerage 0.07766333333333333 25 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O100-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2022-12-08.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 568.6000967458361 3 | Median SAE 27.83860357461102 4 | MSE 315.35875458483565 5 | MAE 7.766317411950378 6 | Acc@100 73.52233333333334 7 | Acc@161 78.60233333333333 8 | Average CAE 608.2586993647415 9 | Median CAE 61.20042899812903 10 | Average 95% PRA 36.75226019373327 11 | Median 95% PRA 3.1656213706865604 12 | PRA COVerage 0.20137333333333332 13 | Outcome ALL 100 14 | Average SAE 568.9086226979437 15 | Median SAE 28.136057359410167 16 | MSE 315.3168704794486 17 | MAE 3.884708725501805 18 | Acc@100 73.476 19 | Acc@161 78.58766666666666 20 | Average CAE 618.2874970100345 21 | Median CAE 71.06793525191361 22 | Average 95% PRA 15.226239472174859 23 | Median 95% PRA 9.142879964102544 24 | PRA COVerage 0.011941366666666666 25 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O100-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-TEXT-ONLY_2022-11-29_metric_N300000_VF-TEXT-ONLY_2023-02-18.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 1881.1270131620297 3 | Median SAE 149.12695451305 4 | MSE 1135.7307776956497 5 | MAE 25.583227937114227 6 | Acc@100 46.605999999999995 7 | Acc@161 50.644 8 | Average CAE 1939.090633727154 9 | Median CAE 199.66859359431743 10 | Average 95% PRA 193.15558184238012 11 | Median 95% PRA 4.445754656123253 12 | PRA COVerage 0.13863666666666666 13 | Outcome ALL 100 14 | Average SAE 1882.4342208517496 15 | Median SAE 149.79454355771463 16 | MSE 1137.260548701505 17 | MAE 12.797386544848854 18 | Acc@100 46.55233333333334 19 | Acc@161 50.61766666666667 20 | Average CAE 1942.8327829215752 21 | Median CAE 224.19472848188917 22 | Average 95% PRA 31.19339623431975 23 | Median 95% PRA 12.272656917861283 24 | PRA COVerage 0.0085576 25 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O3-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2023-02-18.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 561.5762779166488 3 | Median SAE 29.453758479280637 4 | MSE 311.4603138009121 5 | MAE 7.639022741766334 6 | Acc@100 73.50066666666667 7 | Acc@161 78.673 8 | Average CAE 597.5751942328264 9 | Median CAE 60.94959868185593 10 | Average 95% PRA 16.204564645261435 11 | Median 95% PRA 3.096197964297197 12 | PRA COVerage 0.12032666666666667 13 | Outcome ALL 3 14 | Average SAE 561.6625539878369 15 | Median 
SAE 29.544135384316714 16 | MSE 311.4622059291896 17 | MAE 3.820028644168509 18 | Acc@100 73.48633333333333 19 | Acc@161 78.669 20 | Average CAE 601.0722925067477 21 | Median CAE 63.27102175171326 22 | Average 95% PRA 13.92078117550932 23 | Median 95% PRA 7.925957260886589 24 | PRA COVerage 0.31235 25 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O3-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2023-02-18.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 1876.1481224976758 3 | Median SAE 134.89082598225573 4 | MSE 1145.2134790067066 5 | MAE 25.60224848819392 6 | Acc@100 47.33266666666667 7 | Acc@161 51.528666666666666 8 | Average CAE 1908.385586945694 9 | Median CAE 209.7002456012217 10 | Average 95% PRA 45.47747936340816 11 | Median 95% PRA 11.299158256227699 12 | PRA COVerage 0.0764 13 | Outcome ALL 3 14 | Average SAE 1876.2305436206989 15 | Median SAE 134.93077903759564 16 | MSE 1145.2199094124885 17 | MAE 12.801477918607013 18 | Acc@100 47.315000000000005 19 | Acc@161 51.52366666666667 20 | Average CAE 1911.8091583122218 21 | Median CAE 219.9153649029191 22 | Average 95% PRA 24.103429804960783 23 | Median 95% PRA 16.185895312036987 24 | PRA COVerage 0.22035 25 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O3-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2023-02-18.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 556.2925544063847 3 | Median SAE 36.54243725535712 4 | MSE 314.97636029200527 5 | MAE 7.659210079387742 6 | Acc@100 72.64399999999999 7 | Acc@161 78.51700000000001 8 | Outcome ALL 3 9 | Average SAE 556.2938914782208 10 | Median SAE 36.54188096160445 11 | MSE 314.9766550857815 12 | MAE 3.8296141109586066 13 | Acc@100 72.64399999999999 14 | Acc@161 78.51700000000001 15 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O3-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2023-02-19.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 1859.6971406684952 3 | Median SAE 142.77144003340806 4 | MSE 1141.0929485863185 5 | MAE 25.480128561718153 6 | Acc@100 46.49333333333333 7 | Acc@161 51.11866666666667 8 | Outcome ALL 3 9 | Average SAE 1859.696318863178 10 | Median SAE 142.77144269260805 11 | MSE 1141.0931166967357 12 | MAE 12.740060355875377 13 | Acc@100 46.49333333333333 14 | Acc@161 51.117999999999995 15 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-all_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2022-12-05.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 626.5331734393261 3 | Median SAE 69.84155282542723 4 | MSE 317.31179424625583 5 | MAE 8.472252410206108 6 | Acc@100 59.728 7 | Acc@161 70.66799999999999 8 | Average CAE 689.7891627867381 9 | Median CAE 156.3155615968835 10 | Average 95% PRA 37.68922225874048 11 | Median 95% PRA 17.541909748063617 12 | PRA COVerage 0.07858 13 | Outcome ALL 5 14 | Average SAE 626.6558752911719 
15 | Median SAE 70.04934056289767 16 | MSE 317.2918178901608 17 | MAE 4.236919440100403 18 | Acc@100 59.694 19 | Acc@161 70.65766666666666 20 | Average CAE 691.0206138642998 21 | Median CAE 158.53082102124682 22 | Average 95% PRA 21.285102514536906 23 | Median 95% PRA 18.52029142785464 24 | PRA COVerage 0.09724933333333333 25 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-all_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2022-12-05.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 2036.690820895306 3 | Median SAE 234.72389085167492 4 | MSE 1191.569184271369 5 | MAE 27.367099908307914 6 | Acc@100 38.20033333333333 7 | Acc@161 45.15166666666667 8 | Average CAE 2080.0969230715614 9 | Median CAE 356.8301489056366 10 | Average 95% PRA 56.656404871128125 11 | Median 95% PRA 43.787506290806675 12 | PRA COVerage 0.04188666666666667 13 | Outcome ALL 5 14 | Average SAE 2036.7343356781525 15 | Median SAE 234.8195810981424 16 | MSE 1191.543656971664 17 | MAE 13.68385871031472 18 | Acc@100 38.18266666666666 19 | Acc@161 45.14666666666667 20 | Average CAE 2080.868258573548 21 | Median CAE 357.46473741475245 22 | Average 95% PRA 27.878607591261453 23 | Median 95% PRA 28.77262525130461 24 | PRA COVerage 0.06187533333333333 25 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N100_VF-NON-GEO_2022-11-17.txt: -------------------------------------------------------------------------------- 1 | Average SAE 119.66019516154287 2 | Median SAE 17.810982552272733 3 | MSE 16.20156415636118 4 | MAE 0.7875519875345544 5 | Acc@100 92.0 6 | Acc@161 93.0 7 | Average CAE 166.41984894274432 8 | Median CAE 62.34761531265032 9 | Average 95% PRA 10.79242328166499 10 | Median 95% PRA 8.531194412932471 11 | PRA COVerage 0.3 12 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2023-02-20.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 551.0441079868583 3 | Median SAE 29.009580915784433 4 | MSE 302.38283684822113 5 | MAE 7.530359371708234 6 | Acc@100 73.82233333333333 7 | Acc@161 79.01599999999999 8 | Average CAE 587.5484498388176 9 | Median CAE 61.411216369677334 10 | Average 95% PRA 25.972122482501153 11 | Median 95% PRA 3.07923791039419 12 | PRA COVerage 0.12674333333333335 13 | Outcome ALL 5 14 | Average SAE 551.3804511944757 15 | Median SAE 29.40786386084497 16 | MSE 302.45818164043794 17 | MAE 3.7675011234359808 18 | Acc@100 73.77966666666667 19 | Acc@161 78.98366666666666 20 | Average CAE 600.3474117451983 21 | Median CAE 79.1162587443271 22 | Average 95% PRA 15.252935002360253 23 | Median 95% PRA 9.642332670934938 24 | PRA COVerage 0.23365133333333332 25 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2023-02-24.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 880.2045442181371 3 | Median SAE 
24.089552026815696 4 | MSE 1043.625869290649 5 | MAE 14.203196909385907 6 | Acc@100 76.47033333333333 7 | Acc@161 80.278 8 | Average CAE 915.5020383759471 9 | Median CAE 58.556179799350836 10 | Average 95% PRA 22.806675555306633 11 | Median 95% PRA 3.051213891111107 12 | PRA COVerage 0.15504 13 | Outcome ALL 5 14 | Average SAE 880.4064974262934 15 | Median SAE 24.553067239616468 16 | MSE 1043.5267443249566 17 | MAE 7.102857868789059 18 | Acc@100 76.43100000000001 19 | Acc@161 80.26666666666667 20 | Average CAE 926.048857600671 21 | Median CAE 72.42235829900109 22 | Average 95% PRA 14.245569049022276 23 | Median 95% PRA 9.008763377993043 24 | PRA COVerage 0.23922133333333334 25 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2023-02-24.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 1587.7103717657617 3 | Median SAE 49.43012531452397 4 | MSE 1071.5542114028951 5 | MAE 21.942927961935947 6 | Acc@100 57.03366666666667 7 | Acc@161 60.70166666666667 8 | Average CAE 1627.8312262807785 9 | Median CAE 77.75976669138876 10 | Average 95% PRA 73.95988109756931 11 | Median 95% PRA 3.238094487707059 12 | PRA COVerage 0.12459666666666666 13 | Outcome ALL 5 14 | Average SAE 1587.5032657173201 15 | Median SAE 49.839323052728304 16 | MSE 1071.209588332399 17 | MAE 10.97015378954644 18 | Acc@100 56.985 19 | Acc@161 60.668666666666674 20 | Average CAE 1635.872835341197 21 | Median CAE 105.6462485079305 22 | Average 95% PRA 23.616667714973637 23 | Median 95% PRA 11.601910217762153 24 | PRA COVerage 0.18414266666666668 25 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N5000_UVF-NON-GEO_2023-02-24.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 567.3842990687558 3 | Median SAE 26.210280141113937 4 | MSE 514.1601806344198 5 | MAE 8.524774971330553 6 | Acc@100 77.74 7 | Acc@161 82.12 8 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N5000_UVF-TEXT-ONLY_2023-02-24.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 892.2345579521816 3 | Median SAE 31.171950625491604 4 | MSE 606.8715538241867 5 | MAE 12.631638517530538 6 | Acc@100 69.44 7 | Acc@161 74.1 8 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N100_VF-NON-GEO_2022-11-17_metric_N100_VF-NON-GEO_2022-11-22.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 118.96837800851797 3 | Median SAE 17.5024042712 4 | MSE 16.143209358759403 5 | MAE 1.5657629981299996 6 | Acc@100 93.0 7 | Acc@161 93.0 8 | Average CAE 151.89993443774924 9 | Median CAE 53.5580648326241 10 | Average 95% PRA 5.035829398679516 11 | Median 95% PRA 3.018119173570053 12 | PRA COVerage 0.19 13 | Outcome ALL 5 14 | Average SAE 119.66019497129344 15 | Median SAE 17.810982220624744 16 | MSE 
16.20156407447996 17 | MAE 0.7875519863563727 18 | Acc@100 92.0 19 | Acc@161 93.0 20 | Average CAE 166.96709316134712 21 | Median CAE 62.70323179974559 22 | Average 95% PRA 10.792423280458388 23 | Median 95% PRA 8.531194419025953 24 | PRA COVerage 0.3 25 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-NON-GEO_2022-11-02_metric_N300000_VF-NON-GEO_2022-11-23.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 551.0441230576836 3 | Median SAE 29.00965624275 4 | MSE 302.38285051052117 5 | MAE 7.530359547384761 6 | Acc@100 73.82233333333333 7 | Acc@161 79.01599999999999 8 | Average CAE 587.602272535663 9 | Median CAE 61.428721501169356 10 | Average 95% PRA 25.972120676419227 11 | Median 95% PRA 3.0792380510887263 12 | PRA COVerage 0.12673666666666666 13 | Outcome ALL 5 14 | Average SAE 551.3804664537547 15 | Median SAE 29.40809293191692 16 | MSE 302.45819563128003 17 | MAE 3.7675012125909304 18 | Acc@100 73.77966666666667 19 | Acc@161 78.98366666666666 20 | Average CAE 600.4000066790437 21 | Median CAE 79.12131175647453 22 | Average 95% PRA 15.252933884700132 23 | Median 95% PRA 9.642331701034896 24 | PRA COVerage 0.233652 25 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-TEXT-ONLY_2022-11-02_metric_N300000_VF-TEXT-ONLY_2022-11-23.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 1845.00937803803 3 | Median SAE 135.06693287939999 4 | MSE 1102.7013834707896 5 | MAE 25.130295035377046 6 | Acc@100 47.35433333333333 7 | Acc@161 51.53033333333333 8 | Average CAE 1887.652430082346 9 | Median CAE 187.72449180133725 10 | Average 95% PRA 97.19770537206963 11 | Median 95% PRA 3.6437647577106986 12 | PRA COVerage 0.07906 13 | Outcome ALL 5 14 | Average SAE 1845.0441404326284 15 | Median SAE 135.87013273111606 16 | MSE 1102.7810905095876 17 | MAE 12.565707118988758 18 | Acc@100 47.30466666666667 19 | Acc@161 51.49433333333333 20 | Average CAE 1896.2714259390013 21 | Median CAE 214.6930851721048 22 | Average 95% PRA 27.91029143301712 23 | Median 95% PRA 15.158291563378132 24 | PRA COVerage 0.158866 25 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2022-11-23.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 577.751999736082 3 | Median SAE 35.62144710244375 4 | MSE 341.1656133959407 5 | MAE 7.96456962776988 6 | Acc@100 72.74166666666667 7 | Acc@161 78.59766666666667 8 | Outcome ALL 5 9 | Average SAE 577.8066991457856 10 | Median SAE 35.62152013386192 11 | MSE 341.24395821142076 12 | MAE 3.9826415725575184 13 | Acc@100 72.74166666666667 14 | Acc@161 78.59766666666667 15 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2022-11-24.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average 
SAE 1986.703707385929 3 | Median SAE 151.27159786246557 4 | MSE 1209.6200475465141 5 | MAE 26.87739020900269 6 | Acc@100 46.025666666666666 7 | Acc@161 50.599666666666664 8 | Outcome ALL 5 9 | Average SAE 1986.5804809058 10 | Median SAE 151.271602511181 11 | MSE 1209.4877847369746 12 | MAE 13.43808280982075 13 | Acc@100 46.025666666666666 14 | Acc@161 50.599666666666664 15 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-NON-GEO_2022-10-31_metric_N300000_VF-NON-GEO_2022-11-22.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 7812.566795185766 3 | Median SAE 7809.646722224799 4 | MSE 3769.8216207682385 5 | MAE 94.56699774904601 6 | Acc@100 0.11199999999999999 7 | Acc@161 0.48766666666666664 8 | Outcome ALL 5 9 | Average SAE 7812.559883244672 10 | Median SAE 7809.646722224799 11 | MSE 3769.814875589674 12 | MAE 47.283439786580686 13 | Acc@100 0.11199999999999999 14 | Acc@161 0.48766666666666664 15 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-TEXT-ONLY_2022-11-01_metric_N300000_VF-TEXT-ONLY_2022-11-22.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 7976.811124753898 3 | Median SAE 7782.067641114199 4 | MSE 3896.4071088331716 5 | MAE 96.63334555148514 6 | Acc@100 0.043 7 | Acc@161 0.198 8 | Outcome ALL 5 9 | Average SAE 7976.642842875767 10 | Median SAE 7781.7176385929 11 | MSE 3896.2099554995943 12 | MAE 48.31548389645203 13 | Acc@100 0.043 14 | Acc@161 0.198 15 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O50-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2022-12-09.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 556.596930407323 3 | Median SAE 29.012107235794964 4 | MSE 310.7896756651852 5 | MAE 7.625280180899602 6 | Acc@100 73.92433333333334 7 | Acc@161 78.97833333333332 8 | Average CAE 592.3338335828001 9 | Median CAE 60.74781081627437 10 | Average 95% PRA 23.17149074608845 11 | Median 95% PRA 3.0816176045918335 12 | PRA COVerage 0.12573333333333334 13 | Outcome ALL 50 14 | Average SAE 556.8729892447342 15 | Median SAE 29.349274534387682 16 | MSE 310.8038665670043 17 | MAE 3.814452796575479 18 | Acc@100 73.88366666666667 19 | Acc@161 78.94566666666667 20 | Average CAE 604.414241057673 21 | Median CAE 77.04904126035898 22 | Average 95% PRA 14.734713213583234 23 | Median 95% PRA 9.532355912466208 24 | PRA COVerage 0.020491866666666667 25 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO+GEO-ONLY-O50-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2022-12-09.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 1855.923764156897 3 | Median SAE 136.03652401294164 4 | MSE 1117.6151224207629 5 | MAE 25.326143908743465 6 | Acc@100 47.25366666666667 7 | Acc@161 51.409000000000006 8 | Average CAE 1898.1937180477119 9 | Median CAE 189.36825804431618 10 | Average 95% 
PRA 99.38303170048808 11 | Median 95% PRA 3.478817313221037 12 | PRA COVerage 0.08412 13 | Outcome ALL 50 14 | Average SAE 1855.9443449215396 15 | Median SAE 136.6271873705624 16 | MSE 1117.4904022529818 17 | MAE 12.66304185335675 18 | Acc@100 47.215 19 | Acc@161 51.39833333333333 20 | Average CAE 1904.9692293352764 21 | Median CAE 214.7305412802197 22 | Average 95% PRA 27.77954768432691 23 | Median 95% PRA 14.017333622246433 24 | PRA COVerage 0.012973666666666666 25 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO-O5-d-total_mean-pos_spher-weighted-N30e5-B12-E2-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2022-11-28.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 580.6090313522942 3 | Median SAE 46.41566130380352 4 | MSE 312.193184142252 5 | MAE 7.915783033433179 6 | Acc@100 69.607 7 | Acc@161 76.45566666666667 8 | Average CAE 613.9193562319141 9 | Median CAE 81.65887706130992 10 | Average 95% PRA 21.852941664343778 11 | Median 95% PRA 3.7498956132762036 12 | PRA COVerage 0.07282666666666666 13 | Outcome ALL 5 14 | Average SAE 580.970835247516 15 | Median SAE 46.88263239257573 16 | MSE 312.16432163494824 17 | MAE 3.9602365100725545 18 | Acc@100 69.53033333333335 19 | Acc@161 76.40966666666667 20 | Average CAE 635.19984691036 21 | Median CAE 106.95600581436469 22 | Average 95% PRA 15.877051629880834 23 | Median 95% PRA 11.788060470218173 24 | PRA COVerage 0.12605666666666668 25 | -------------------------------------------------------------------------------- /results/metric/U-NON-GEO-O5-d-total_mean-pos_spher-weighted-N30e5-B12-E2-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2022-11-28.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 1875.2932581156072 3 | Median SAE 172.9306858701335 4 | MSE 1099.589240160285 5 | MAE 25.47369988597128 6 | Acc@100 44.204 7 | Acc@161 49.30766666666667 8 | Average CAE 1911.9219339073627 9 | Median CAE 208.6942984449754 10 | Average 95% PRA 95.75923460662027 11 | Median 95% PRA 4.8870412499624925 12 | PRA COVerage 0.04433 13 | Outcome ALL 5 14 | Average SAE 1875.3638156446036 15 | Median SAE 172.93952006940208 16 | MSE 1099.3982200402288 17 | MAE 12.738032951488462 18 | Acc@100 44.167 19 | Acc@161 49.288 20 | Average CAE 1927.2933304174228 21 | Median CAE 235.14731426613002 22 | Average 95% PRA 24.574656301155283 23 | Median 95% PRA 13.810513023648852 24 | PRA COVerage 0.075412 25 | -------------------------------------------------------------------------------- /results/metric/U-TEXT-ONLY-O5-d-total_mean-pos_spher-weighted-N30e5-B12-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2022-11-30.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 781.7706650795018 3 | Median SAE 86.86877967943047 4 | MSE 407.88967359115975 5 | MAE 10.617183446501544 6 | Acc@100 53.806 7 | Acc@161 64.97433333333333 8 | Average CAE 902.3277154947419 9 | Median CAE 234.14109002107455 10 | Average 95% PRA 148.92441323319156 11 | Median 95% PRA 24.70561775871207 12 | PRA COVerage 0.13995 13 | Outcome ALL 5 14 | Average SAE 782.0235068256861 15 | Median SAE 87.21493077857644 16 | MSE 407.93450996375645 17 | MAE 5.310163434238611 18 | Acc@100 53.727000000000004 19 | Acc@161 64.93766666666667 20 | Average CAE 913.1045627056881 21 | Median CAE 267.13501018077693 22 | Average 95% PRA 36.20489287325245 23 | 
Median 95% PRA 26.438525828931986 24 | PRA COVerage 0.118212 25 | -------------------------------------------------------------------------------- /results/metric/U-TEXT-ONLY-O5-d-total_mean-pos_spher-weighted-N30e5-B12-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2022-11-30.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 1547.0540351194697 3 | Median SAE 176.21210408917602 4 | MSE 839.0551868683772 5 | MAE 20.879623384049147 6 | Acc@100 41.24033333333333 7 | Acc@161 48.747 8 | Average CAE 1710.817416332019 9 | Median CAE 423.7717242604772 10 | Average 95% PRA 546.0467202967111 11 | Median 95% PRA 59.65755199413765 12 | PRA COVerage 0.11468333333333333 13 | Outcome ALL 5 14 | Average SAE 1547.3396256779183 15 | Median SAE 176.48929304210884 16 | MSE 839.1979633426333 17 | MAE 10.440824292898615 18 | Acc@100 41.199000000000005 19 | Acc@161 48.708666666666666 20 | Average CAE 1708.7545371274753 21 | Median CAE 442.1363776181922 22 | Average 95% PRA 58.81665073457992 23 | Median 95% PRA 38.20872910249881 24 | PRA COVerage 0.08802666666666667 25 | -------------------------------------------------------------------------------- /results/metric/US-U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N0e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N50_VF-NON-GEO_2022-11-22.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 9914.668096263977 3 | Median SAE 9995.085508992233 4 | MSE 5357.341812998586 5 | MAE 119.35189060703888 6 | Acc@100 0.0 7 | Acc@161 0.0 8 | Average CAE 9914.562013061304 9 | Median CAE 10002.199218399091 10 | Average 95% PRA 12.175786353180174 11 | Median 95% PRA 11.89449912855745 12 | PRA COVerage 0.0 13 | Outcome ALL 5 14 | Average SAE 9926.753104973639 15 | Median SAE 10001.339856228697 16 | MSE 5363.141463231329 17 | MAE 59.75125683217192 18 | Acc@100 0.0 19 | Acc@161 0.0 20 | Average CAE 9927.272739097398 21 | Median CAE 10007.489719519272 22 | Average 95% PRA 16.772039178406793 23 | Median 95% PRA 16.734159757402896 24 | PRA COVerage 0.0 25 | -------------------------------------------------------------------------------- /results/metric/US-U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-NON-GEO_2023-02-23.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 802.0893183793827 3 | Median SAE 24.974492747904417 4 | MSE 151.58256803467805 5 | MAE 9.968539887598322 6 | Acc@100 61.97266666666666 7 | Acc@161 63.995999999999995 8 | Average CAE 836.093719699225 9 | Median CAE 57.03076756833259 10 | Average 95% PRA 49.5394874676072 11 | Median 95% PRA 2.9997513563907243 12 | PRA COVerage 0.30149333333333334 13 | Outcome ALL 5 14 | Average SAE 802.0893183927425 15 | Median SAE 24.974492747940083 16 | MSE 151.58256804297062 17 | MAE 4.984269943866374 18 | Acc@100 61.97266666666666 19 | Acc@161 63.995999999999995 20 | Average CAE 836.0937366044021 21 | Median CAE 57.07486283490132 22 | Average 95% PRA 20.88662783493309 23 | Median 95% PRA 7.513931392525943 24 | PRA COVerage 0.16537466666666667 25 | -------------------------------------------------------------------------------- /results/metric/US-U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N300000_VF-TEXT-ONLY_2023-02-23.txt: 
-------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 1162.604288374721 3 | Median SAE 598.8986795557007 4 | MSE 198.96711438281176 5 | MAE 14.63374204393779 6 | Acc@100 39.416333333333334 7 | Acc@161 41.291666666666664 8 | Average CAE 1204.8100220315987 9 | Median CAE 679.5672225740713 10 | Average 95% PRA 104.09604027290496 11 | Median 95% PRA 63.443590773093476 12 | PRA COVerage 0.20825333333333335 13 | Outcome ALL 5 14 | Average SAE 1162.604288516567 15 | Median SAE 598.8986795584461 16 | MSE 198.96711445363616 17 | MAE 7.316871022877962 18 | Acc@100 39.416333333333334 19 | Acc@161 41.291666666666664 20 | Average CAE 1204.9063324921221 21 | Median CAE 678.7367829126739 22 | Average 95% PRA 35.27105756637679 23 | Median 95% PRA 34.555602828281465 24 | PRA COVerage 0.11697533333333333 25 | -------------------------------------------------------------------------------- /results/metric/US-U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N4999_UVF-NON-GEO_2023-02-24.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 375.03136106819596 3 | Median SAE 12.91331621363022 4 | MSE 92.0769672561113 5 | MAE 4.7583405970026575 6 | Acc@100 80.63612722544508 7 | Acc@161 83.19663932786557 8 | -------------------------------------------------------------------------------- /results/metric/US-U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_metric_N4999_UVF-TEXT-ONLY_2023-02-24.txt: -------------------------------------------------------------------------------- 1 | Outcome BEST 2 | Average SAE 431.1931917816224 3 | Median SAE 14.518444548017039 4 | MSE 98.51503430916212 5 | MAE 5.539108098993033 6 | Acc@100 75.77515503100621 7 | Acc@161 78.11562312462492 8 | -------------------------------------------------------------------------------- /runs/prob/U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]-2022-10-06_logs/events.out.tfevents.1665082080.gpu127.3070880.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/runs/prob/U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]-2022-10-06_logs/events.out.tfevents.1665082080.gpu127.3070880.0 -------------------------------------------------------------------------------- /runs/prob/U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]-2022-10-06_logs/events.out.tfevents.1665062385.gpu118.2632089.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/runs/prob/U-NON-GEO+GEO-ONLY-O5-d-total_mean-mf_mean-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]-2022-10-06_logs/events.out.tfevents.1665062385.gpu118.2632089.0 -------------------------------------------------------------------------------- /runs/spat/U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]-2022-10-09_logs/events.out.tfevents.1665313776.gpu113.3164054.0: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/runs/spat/U-NON-GEO+GEO-ONLY-O1-d-total_type-mf_mean-NP-N30e5-B10-E3-cosine-LR[1e-05;1e-06]-2022-10-09_logs/events.out.tfevents.1665313776.gpu113.3164054.0 -------------------------------------------------------------------------------- /runs/spat/U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]-2022-10-23_logs/events.out.tfevents.1666523343.gpu148.1960620.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/runs/spat/U-NON-GEO+GEO-ONLY-O5-d-total_type-mf_mean-NP-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]-2022-10-23_logs/events.out.tfevents.1666523343.gpu148.1960620.0 -------------------------------------------------------------------------------- /supplementary_resources/article_draft.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/supplementary_resources/article_draft.pdf -------------------------------------------------------------------------------- /supplementary_resources/img/loss-graph-prob.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/supplementary_resources/img/loss-graph-prob.png -------------------------------------------------------------------------------- /supplementary_resources/img/loss-graph-spat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/supplementary_resources/img/loss-graph-spat.png -------------------------------------------------------------------------------- /supplementary_resources/img/map-density.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/supplementary_resources/img/map-density.png -------------------------------------------------------------------------------- /supplementary_resources/img/model-train.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/supplementary_resources/img/model-train.png -------------------------------------------------------------------------------- /supplementary_resources/img/mop-loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/supplementary_resources/img/mop-loss.png -------------------------------------------------------------------------------- /supplementary_resources/img/prediction-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/supplementary_resources/img/prediction-example.png -------------------------------------------------------------------------------- /supplementary_resources/img/sop-loss.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/supplementary_resources/img/sop-loss.png -------------------------------------------------------------------------------- /supplementary_resources/img/total-loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/K4TEL/geo-twitter/459ca2e294d86a7075d8c11ae323a5fba84ca2e8/supplementary_resources/img/total-loss.png -------------------------------------------------------------------------------- /supplementary_resources/scripts/bash/collector.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH -p defaultp # partition (queue) 3 | #SBATCH -N 1 # number of nodes 4 | #SBATCH -c 4 # number of cpus (cores) 5 | #SBATCH --mem 150G # memory pool for all cores # 32GB for folktables ablation 6 | #SBATCH --time 0-1:59:00 # time (D-HH:MM:SS) 7 | #SBATCH --job-name=clt-data 8 | #SBATCH -o world.%A_%a.%N.out # STDOUT 9 | #SBATCH -e world.%A_%a.%N.err # STDERR 10 | #SBATCH --requeue 11 | 12 | SEED=${SLURM_ARRAY_TASK_ID:-0} 13 | SEEDSTR=$( printf "%01d" $SEED ) 14 | 15 | hostname 16 | date 17 | 18 | module load python/3.8 19 | source ~/twitter-env/bin/activate 20 | 21 | export OMP_NUM_THREADS=4 22 | 23 | ARG=( "$@" ) 24 | FILE="${ARG[0]}" 25 | 26 | if [ -e ${FILE} ] 27 | then 28 | echo ${FILE} exists 29 | stat -L -c "%a %G %U" ${FILE} 30 | cd ${HOME}/geo-twitter/ 31 | srun python -u data-collector.py ${ARG[*]} 32 | else 33 | echo ${FILE} does not exist 34 | fi 35 | 36 | date 37 | -------------------------------------------------------------------------------- /supplementary_resources/scripts/bash/data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | FILES20="/archive3/group/chlgrp/twitter-collection-2020/twitter-2020-*.txt" 4 | FILES21="/archive3/group/chlgrp/twitter-collection-2021/twitter-2021-*.txt" 5 | FILES22="/archive3/group/chlgrp/twitter-collection-2022/twitter-2022-*.txt" 6 | 7 | FILES=( $FILES20 $FILES21 $FILES22 ) 8 | total=${#FILES[@]} 9 | echo Total number of files is ${total} 10 | c=0 11 | 12 | 13 | for f in "${FILES[@]}" 14 | do 15 | if [ -e ${f} ] 16 | then 17 | echo ${f} exists 18 | stat -L -c "%a %G %U" ${f} 19 | if [ ! 
-s ${HOME}/geo-twitter/datasets/world/${f:47:18}.txt ] 20 | then 21 | echo filtered file is empty or does not exist 22 | sbatch collector.sh ${f} 23 | c=$((c+1)) 24 | else 25 | echo filtered dataset is already collected 26 | fi 27 | else 28 | echo ${f} does not exist 29 | fi 30 | done 31 | 32 | 33 | echo Total number of files is ${total} 34 | echo Total number of scripts launched is ${c} 35 | -------------------------------------------------------------------------------- /supplementary_resources/scripts/bash/train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --ntasks=1 3 | #SBATCH --partition=gpu 4 | #SBATCH --gres=gpu:1 5 | #SBATCH --constraint=GTX1080Ti 6 | #SBATCH --cpus-per-task=12 7 | #SBATCH --export=NONE 8 | #SBATCH --mem 100G # memory pool for all cores # 32GB for folktables ablation 9 | #SBATCH --time 10-00:00:00 # time (D-HH:MM:SS) 10 | #SBATCH --job-name=world-model 11 | #SBATCH -o world-model.%A_%a.%N.out # STDOUT 12 | #SBATCH -e world-model.%A_%a.%N.err # STDERR 13 | #SBATCH --requeue 14 | 15 | unset SLURM_EXPORT_ENV 16 | 17 | SEED=${SLURM_ARRAY_TASK_ID:-0} 18 | SEEDSTR=$( printf "%01d" $SEED ) 19 | 20 | hostname 21 | date 22 | 23 | module load python/3.8 24 | module load cuda 25 | source ${HOME}/twitter-env/bin/activate 26 | 27 | export OMP_NUM_THREADS=12 28 | export CUDA_VISIBLE_DEVICES=0 29 | 30 | cd ${HOME}/geo-twitter/ 31 | 32 | ARG=( "$@" ) 33 | srun --cpu_bind=verbose python -u train_bert.py ${ARG[*]} 34 | 35 | date 36 | -------------------------------------------------------------------------------- /supplementary_resources/scripts/python/bert_train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from transformers import BertTokenizer 3 | from utils.model_trainer import * 4 | 5 | # Entry point for training and evaluation of the models 6 | 7 | f = ["GEO", "NON-GEO", "META-DATA", "TEXT-ONLY", "GEO-ONLY", "USER-ONLY"] 8 | 9 | dataset_file = 'worldwide-twitter-day-small.jsonl' # .jsonl 10 | features = [f[1], f[4]] 11 | val_f = None # None -> features[0] 12 | target_columns = ["lon", "lat"] 13 | 14 | original_worldwide_model = "bert-base-multilingual-cased" 15 | original_usa_model = "bert-base-cased" 16 | 17 | # parameters = dict( 18 | # max_lr = [5e-5, 1e-5], 19 | # min_lr = [5e-6, 1e-6, 1e-8, 1e-16], 20 | # scheduler = ["cosine", "plateau"] 21 | # ) 22 | # param_values = [v for v in parameters.values()] 23 | 24 | covariance_types = [None, "spher"] # [None, "full", "spher", "diag", "tied"] 25 | scheduler_types = ["cosine", "linear", "plateau"] # ["cosine", "linear", "cosine-long", "plateau", "step", "multi step", "one cycle", "cyclic"] 26 | 27 | loss_distance = True 28 | loss_mf = "mean" # mean/sum - mean if features > 1 29 | loss_prob = "pos" # all/pos - pos if prob 30 | loss_total = "mean" # sum/mean/type - mean if prob else type (spat) 31 | 32 | outcomes = 5 33 | covariance = covariance_types[1] # None/spher 34 | 35 | epochs = 3 36 | log_step = 1000 37 | 38 | batch_size = 4 39 | 40 | lr_max = 1e-5 41 | lr_min = 1e-6 42 | scheduler = scheduler_types[0] 43 | 44 | val_size = 1000 # samples/users if -vu 45 | threshold = 100 46 | 47 | train_size = 0 48 | test_ratio = 0.1 49 | seed = 42 50 | 51 | 52 | def main(): 53 | parser = argparse.ArgumentParser(description='Finetune multilingual transformer model') 54 | 55 | # common 56 | parser.add_argument('-n', '--n-epochs', type=int, default=epochs, help='Number of the training epochs') 57 | 
parser.add_argument('-b', '--batch_size', type=int, default=batch_size, help='Batch size (default: 4)') 58 | 59 | parser.add_argument('-m', '--local_model', type=str, default=None, 60 | help='Filename prefix of local model (!NOTE must fit related args)') 61 | 62 | parser.add_argument('-bb', '--bert_base', type=str, default=None, 63 | help="BERT base model (default: worldwide)") 64 | 65 | 66 | parser.add_argument('--train', action="store_true", help="Start finetuning") 67 | parser.add_argument('--eval', action="store_true", help="Start evaluation") 68 | parser.add_argument('--hptune', action="store_true", help="Start training with hyperparameter tuning") 69 | 70 | # dataset 71 | parser.add_argument('-d', '--dataset', type=str, default=dataset_file, help="Input dataset (in jsonl format)") 72 | parser.add_argument('-ss', '--skip', type=int, default=0, help='Number of dataset samples to skip') 73 | 74 | parser.add_argument('-f', '--features', default=features, nargs='+', help="Feature column names") 75 | parser.add_argument('-t', '--target-cols', default=target_columns, nargs='+', help="Target column names") 76 | 77 | parser.add_argument('-s', '--seed', type=int, default=seed, help='Random seed (default: 42)') 78 | 79 | 80 | parser.add_argument('-nnv', '--norm_numb', action="store_true", 81 | help="Normalize labels' number values (default: False)") 82 | 83 | parser.add_argument('-cv', '--class_val', action="store_true", 84 | help="Classification labels as values (default: False)") 85 | 86 | 87 | # --train 88 | parser.add_argument('--no-ckp', action="store_false", help='Saving model checkpoints during training (Default: True)') 89 | 90 | parser.add_argument('-ls', '--log_step', type=int, default=log_step, help='Log step (default: 1000)') 91 | 92 | parser.add_argument('-lr', '--learn_rate', type=float, default=lr_max, 93 | help='Learning rate maximum to start from (default: 1e-5)') 94 | parser.add_argument('-lrm', '--learn_rate_min', type=float, default=lr_min, 95 | help='Learning rate minimum to end on (default: 1e-6)') 96 | parser.add_argument('-sdl', '--scheduler', type=str, default=scheduler, help="Scheduler type (Default: 'cosine')") 97 | 98 | parser.add_argument('-ts', '--train_size', type=int, default=train_size, help='Training dataloader size') 99 | parser.add_argument('-tr', '--test_ratio', type=float, default=test_ratio, help='Training dataloader test data ratio (default: 0.1)') 100 | 101 | parser.add_argument('-lmf', '--loss_mf', type=str, default=loss_mf, 102 | help="Multi feature loss handle mean or sum (default: mean)") 103 | 104 | 105 | # --eval 106 | parser.add_argument('-v', '--val_size', type=int, default=val_size, help='Validation dataloader size') 107 | 108 | # geo specific 109 | parser.add_argument('-o', '--outcomes', type=int, default=outcomes, help="Number of outcomes (lon, lat) per tweet") 110 | parser.add_argument('-ld', '--loss_dist', action="store_false", help="Distance loss criterion (default: True)") 111 | 112 | parser.add_argument('-lp', '--loss_prob', type=str, default=loss_prob, 113 | help="GMM neg LLH loss stays in domain 'all' or 'pos' (default: 'pos')") 114 | parser.add_argument('-c', '--covariance', type=str, default=covariance, 115 | help="GMM covariance matrix type (Default: 'spher')") 116 | parser.add_argument('-nw', '--not-weighted', action="store_false", 117 | help="GMM weights are not equal (default: True)") 118 | 119 | parser.add_argument('-lt', '--loss_total', type=str, default=loss_total, 120 | help="Total loss handle by model's 'type' or 'sum' of 
all loss values (default: 'type')") 121 | 122 | args = parser.parse_args() 123 | 124 | if args.local_model is None: 125 | prefix = f"{'US-' if args.usa_model else ''}{'U-' if not args.scale_coord else ''}{'+'.join(args.features)}-O{args.outcomes}-{'d' if args.loss_dist else 'c'}-" \ 126 | f"total_{args.loss_total if args.covariance is not None else 'type'}-{'mf_' + args.loss_mf + '-' if len(args.features) > 1 else ''}" \ 127 | f"{args.loss_prob + '_' if args.covariance is not None else ''}{args.covariance if args.covariance is not None else 'NP'}-" \ 128 | f"{'weighted-' if args.weighted and args.outcomes > 1 else ''}N{args.train_size//100000}e5-" \ 129 | f"B{args.batch_size}-E{args.nepochs}-{args.scheduler}-LR[{args.learn_rate};{args.learn_rate_min}]" 130 | else: 131 | prefix = args.local_model 132 | 133 | print(f"Model prefix:\t{prefix}") 134 | if torch.cuda.is_available(): 135 | print(f"DEVICE\tAvailable GPU has {torch.cuda.device_count()} devices, using {torch.cuda.get_device_name(0)}") 136 | print(f"DEVICE\tCPU has {torch.get_num_threads()} threads") 137 | else: 138 | print(f"DEVICE\tNo GPU available, using the CPU with {torch.get_num_threads()} threads instead.") 139 | 140 | original_model = original_usa_model if args.usa_model else original_worldwide_model 141 | 142 | dataloader = TwitterDataloader(args.dataset, 143 | args.features, 144 | target_columns, 145 | BertTokenizer.from_pretrained(original_model), 146 | args.seed, 147 | args.scale_coord, 148 | val_f) 149 | 150 | # no settings run to save filtered by condition dataset copy 151 | # dataloader.filter_dataset("code", None, ["CA", 'FR', 'GB']) 152 | 153 | trainer = ModelTrainer(prefix, 154 | dataloader, 155 | args.nepochs, 156 | args.batch_size, 157 | args.outcomes, 158 | args.covariance, 159 | args.weighted, 160 | args.loss_dist, 161 | args.loss_mf, 162 | args.loss_prob, 163 | args.loss_total, 164 | args.learn_rate, 165 | args.learn_rate_min, 166 | original_model) 167 | 168 | # if args.hptune: 169 | # trainer.hp_tuning(args.train_size, 170 | # args.test_ratio, 171 | # param_values, 172 | # args.log_step) 173 | 174 | if args.train: 175 | trainer.finetune(args.train_size, 176 | args.test_ratio, 177 | f"{prefix}.pth", 178 | args.nockp, 179 | args.log_step, 180 | args.scheduler, 181 | args.skip) 182 | 183 | if args.eval: 184 | trainer.eval(args.val_size, 185 | args.threshold, 186 | args.val_size, 187 | args.val_user, 188 | args.train_size) 189 | 190 | 191 | if __name__ == "__main__": 192 | main() 193 | -------------------------------------------------------------------------------- /supplementary_resources/scripts/python/camambert-test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import re 4 | 5 | from transformers import CamembertModel, CamembertTokenizer, AdamW, get_linear_schedule_with_warmup 6 | 7 | from sklearn.model_selection import train_test_split 8 | from sklearn.preprocessing import StandardScaler 9 | 10 | import torch 11 | import torch.nn as nn 12 | from torch.utils.data import TensorDataset, DataLoader 13 | from torch.nn.utils.clip_grad import clip_grad_norm 14 | 15 | file = 'ua_dataset.jsonl' 16 | model_name = "camembert-base" 17 | 18 | output_model = 'reg_saved.pth' 19 | 20 | epochs = 3 21 | batch_size = 16 22 | 23 | tokenizer = CamembertTokenizer.from_pretrained(model_name) 24 | coord_scaler = StandardScaler() 25 | 26 | # minimal settings test run 27 | # original to this adaptation code: 
https://medium.com/@anthony.galtier/fine-tuning-bert-for-a-regression-task-is-a-description-enough-to-predict-a-propertys-list-price-cf97cd7cb98a 28 | # author: Anthony Galtier 29 | 30 | def filter_websites(text): 31 | pattern = r'(http\:\/\/|https\:\/\/)?([a-z0-9][a-z0-9\-]*\.)+[a-z][a-z\-]*' 32 | text = re.sub(pattern, '', text) 33 | return text 34 | 35 | 36 | def filter_long_descriptions(tokenizer, descriptions, max_len): 37 | indices = [] 38 | lengths = tokenizer(descriptions, padding=False, 39 | truncation=False, return_length=True)['length'] 40 | for i in range(len(descriptions)): 41 | if lengths[i] <= max_len-2: 42 | indices.append(i) 43 | return indices 44 | 45 | 46 | def create_dataloaders(inputs, masks, labels, batch_size): 47 | input_tensor = torch.tensor(inputs) 48 | mask_tensor = torch.tensor(masks) 49 | labels_tensor = torch.tensor(labels) 50 | dataset = TensorDataset(input_tensor, mask_tensor, 51 | labels_tensor) 52 | dataloader = DataLoader(dataset, batch_size=batch_size, 53 | shuffle=True) 54 | return dataloader 55 | 56 | 57 | class BertRegressor(nn.Module): 58 | def __init__(self, drop_rate=0.2, freeze_camembert=False): 59 | super(BertRegressor, self).__init__() 60 | D_in, D_out = 768, 1 61 | 62 | self.bert = CamembertModel.from_pretrained(model_name, return_dict=True) 63 | self.regressor = nn.Sequential( 64 | nn.Dropout(drop_rate), 65 | nn.Linear(D_in, D_out)) 66 | 67 | def forward(self, input_ids, attention_masks): 68 | 69 | outputs = self.bert(input_ids, attention_masks) 70 | class_label_output = outputs[1] 71 | outputs = self.regressor(class_label_output) 72 | return outputs 73 | 74 | 75 | def train(model, optimizer, scheduler, loss_function, epochs, 76 | train_dataloader, device, clip_value=2): 77 | for epoch in range(epochs): 78 | print("Epoch:", epoch) 79 | print("-----") 80 | best_loss = 1e10 81 | model.train() 82 | for step, batch in enumerate(train_dataloader): 83 | print("Step:", step) 84 | batch_inputs, batch_masks, batch_labels = tuple(b.to(device) for b in batch) 85 | model.zero_grad() 86 | outputs = model(batch_inputs, batch_masks) 87 | loss = loss_function(outputs.squeeze(), 88 | batch_labels.squeeze()) 89 | loss.backward() 90 | clip_grad_norm(model.parameters(), clip_value) 91 | optimizer.step() 92 | scheduler.step() 93 | 94 | return model 95 | 96 | 97 | def evaluate(model, loss_function, test_dataloader, device): 98 | model.eval() 99 | test_loss, test_r2 = [], [] 100 | for batch in test_dataloader: 101 | batch_inputs, batch_masks, batch_labels = tuple(b.to(device) for b in batch) 102 | with torch.no_grad(): 103 | outputs = model(batch_inputs, batch_masks) 104 | loss = loss_function(outputs, batch_labels) 105 | test_loss.append(loss.item()) 106 | r2 = r2_score(outputs, batch_labels) 107 | test_r2.append(r2.item()) 108 | return test_loss, test_r2 109 | 110 | 111 | def r2_score(outputs, labels): 112 | labels_mean = torch.mean(labels) 113 | ss_tot = torch.sum((labels - labels_mean) ** 2) 114 | ss_res = torch.sum((labels - outputs) ** 2) 115 | r2 = 1 - ss_res / ss_tot 116 | return r2 117 | 118 | 119 | def predict(model, dataloader, device): 120 | model.eval() 121 | output = [] 122 | for batch in dataloader: 123 | batch_inputs, batch_masks, _ = tuple(b.to(device) for b in batch) 124 | with torch.no_grad(): 125 | res = model(batch_inputs, 126 | batch_masks) 127 | #print(res) 128 | #print(res.view(1,-1).tolist()[0]) 129 | output += res.view(1,-1).tolist()[0] 130 | #print(output) 131 | return output 132 | 133 | 134 | def pretraining(model): 135 | model = 
train(model, optimizer, scheduler, loss_function, epochs, 136 | train_dataloader, device, clip_value=2) 137 | 138 | def save(model, optimizer): 139 | torch.save({ 140 | 'model_state_dict': model.state_dict(), 141 | 'optimizer_state_dict': optimizer.state_dict() 142 | }, output_model) 143 | 144 | print(evaluate(model, loss_function, test_dataloader, device)) 145 | 146 | save(model, optimizer) 147 | 148 | 149 | def evalueting(model): 150 | checkpoint = torch.load(output_model, map_location='cpu') 151 | model.load_state_dict(checkpoint['model_state_dict']) 152 | optimizer.load_state_dict(checkpoint['optimizer_state_dict']) 153 | 154 | val_set = val_data 155 | 156 | encoded_val_corpus = tokenizer(text=val_set.clear_text.tolist(), 157 | add_special_tokens=True, 158 | padding='max_length', 159 | truncation='longest_first', 160 | max_length=300, 161 | return_attention_mask=True) 162 | 163 | val_input_ids = np.array(encoded_val_corpus['input_ids']) 164 | val_attention_mask = np.array(encoded_val_corpus['attention_mask']) 165 | val_labels = val_set.longitude.to_numpy().astype(np.float32) 166 | val_labels = coord_scaler.transform(val_labels.reshape(-1, 1)) 167 | val_dataloader = create_dataloaders(val_input_ids, 168 | val_attention_mask, val_labels, batch_size) 169 | 170 | y_pred_scaled = predict(model, val_dataloader, device) 171 | print(y_pred_scaled) 172 | 173 | y_test = val_set.longitude.to_numpy() 174 | y_pred = coord_scaler.inverse_transform(np.asarray(y_pred_scaled, dtype=np.float32).reshape(-1, 1)) 175 | 176 | print(y_pred) 177 | 178 | for i in range(len(y_test)): 179 | print(y_test[i], y_pred[i][0]) 180 | 181 | from sklearn.metrics import mean_absolute_error 182 | from sklearn.metrics import median_absolute_error 183 | from sklearn.metrics import mean_squared_error 184 | from sklearn.metrics import mean_absolute_percentage_error 185 | from sklearn.metrics import r2_score 186 | 187 | mae = mean_absolute_error(y_test, y_pred) 188 | mdae = median_absolute_error(y_test, y_pred) 189 | mse = mean_squared_error(y_test, y_pred) 190 | mape = mean_absolute_percentage_error(y_test, y_pred) 191 | #mdape = ((pd.Series(y_test) - pd.Series(y_pred)) / pd.Series(y_test)).abs().median() 192 | r_squared = r2_score(y_test, y_pred) 193 | 194 | print(mae, mdae, mse, mape, r_squared) 195 | 196 | 197 | data = pd.read_json(path_or_buf=file, lines=True) 198 | print(data.head()) 199 | print(data.info()) 200 | 201 | data['clear_text'] = data.texts.apply(filter_websites) 202 | 203 | train_data = data.iloc[100:, :] 204 | val_data = data.iloc[:100, :] 205 | 206 | df = train_data 207 | print(df.info()) 208 | 209 | encoded_corpus = tokenizer(text=df.clear_text.tolist(), 210 | add_special_tokens=True, 211 | padding='max_length', 212 | truncation='longest_first', 213 | max_length=300, 214 | return_attention_mask=True) 215 | 216 | input_ids = encoded_corpus['input_ids'] 217 | attention_mask = encoded_corpus['attention_mask'] 218 | 219 | short_descriptions = filter_long_descriptions(tokenizer, df.clear_text.tolist(), 300) 220 | input_ids = np.array(input_ids)[short_descriptions] 221 | attention_mask = np.array(attention_mask)[short_descriptions] 222 | labels = df.longitude.to_numpy()[short_descriptions].astype(np.float32) 223 | 224 | test_size = 0.1 225 | seed = 42 226 | 227 | train_inputs, test_inputs, train_labels, test_labels = train_test_split(input_ids, 228 | labels, 229 | test_size=test_size, 230 | random_state=seed) 231 | 232 | train_masks, test_masks, _, _ = train_test_split(attention_mask, 233 | labels, 234 | 
test_size=test_size, 235 | random_state=seed) 236 | 237 | 238 | coord_scaler.fit(train_labels.reshape(-1, 1)) 239 | 240 | train_labels = coord_scaler.transform(train_labels.reshape(-1, 1)) 241 | test_labels = coord_scaler.transform(test_labels.reshape(-1, 1)) 242 | 243 | train_dataloader = create_dataloaders(train_inputs, train_masks, 244 | train_labels, batch_size) 245 | test_dataloader = create_dataloaders(test_inputs, test_masks, 246 | test_labels, batch_size) 247 | 248 | model = BertRegressor(drop_rate=0.2) 249 | 250 | if torch.cuda.is_available(): 251 | device = torch.device("cuda") 252 | print("Using GPU.") 253 | else: 254 | print("No GPU available, using the CPU instead.") 255 | device = torch.device("cpu") 256 | 257 | model.to(device) 258 | 259 | 260 | optimizer = AdamW(model.parameters(), 261 | lr=5e-5, 262 | eps=1e-8) 263 | 264 | 265 | total_steps = len(train_dataloader) * epochs 266 | scheduler = get_linear_schedule_with_warmup(optimizer, 267 | num_warmup_steps=0, num_training_steps=total_steps) 268 | 269 | loss_function = nn.MSELoss() 270 | 271 | #pretraining(model) 272 | evalueting(model) 273 | -------------------------------------------------------------------------------- /supplementary_resources/scripts/python/coords_plots.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import pandas as pd 4 | import matplotlib.cm as cm 5 | import scipy.stats as st 6 | from mpl_toolkits.basemap import Basemap 7 | from matplotlib.colors import LinearSegmentedColormap 8 | from sklearn.mixture import GaussianMixture, BayesianGaussianMixture 9 | from matplotlib.patches import Ellipse 10 | from matplotlib.colors import LogNorm 11 | from sklearn.preprocessing import MinMaxScaler 12 | 13 | 14 | def load_jsonl(filename): 15 | filename = f"datasets/{filename}" 16 | print(f"DATASET\tLOAD\tLoading dataset from {filename}") 17 | data = pd.read_json(path_or_buf=filename, lines=True) 18 | print(f"DATASET\tLOAD\tDataset of {len(data.index)} coords is loaded") 19 | return data 20 | 21 | 22 | def save_df(data, filename): 23 | with open(filename, "w") as f: 24 | data.to_json(f, orient='records', lines=True) 25 | print(f"VAL\tSAVE\tEstimated data of {len(data.index)} coords is written to file: {filename}") 26 | 27 | # coords = load_jsonl(filename) 28 | # 29 | # #print(coords) 30 | # test = coords[::100] 31 | # print(test) 32 | # 33 | # dens_u = sm.nonparametric.KDEMultivariate(data=[coords["longitude"], coords["latitude"]], var_type='cc') 34 | # print(dens_u) 35 | # print(dens_u.bw) 36 | # 37 | # test["density"] = dens_u.pdf(test) 38 | # save_df(test, "datasets/test.jsonl") 39 | # #print(dens_test) 40 | # 41 | # x = test["longitude"] 42 | # y = test["latitude"] 43 | # #plt.pcolormesh([x, y], dens_test, shading='auto') 44 | # #plt.show() 45 | 46 | 47 | # Kernel Density Estimation surface on map 48 | def kde(coords): 49 | x, y = coords["longitude"], coords["latitude"] 50 | 51 | xmin, xmax = -180, 180 52 | ymin, ymax = -90, 90 53 | 54 | xx, yy = np.mgrid[xmin:xmax:200j, ymin:ymax:200j] 55 | print(f"KDE for {len(xx)**2} points is calculating") 56 | 57 | positions = np.vstack([xx.ravel(), yy.ravel()]) 58 | values = np.vstack([x, y]) 59 | kernel = st.gaussian_kde(values) 60 | # f = np.reshape(kernel(positions).T, xx.shape) 61 | # kde = pd.DataFrame(f) 62 | # save_df(kde, f"datasets/{kde_results}") 63 | f = load_jsonl(kde_results) 64 | print(f) 65 | 66 | fig = plt.figure(figsize=(20, 15)) 67 | ax = 
plt.axes(projection='3d') 68 | 69 | ncolors = 256 70 | color_array = plt.get_cmap('rainbow')(range(ncolors)) 71 | color_array[:,-1] = np.linspace(0.2,1.0,ncolors) 72 | map_object = LinearSegmentedColormap.from_list(name='rainbow_alpha',colors=color_array) 73 | plt.register_cmap(cmap=map_object) 74 | 75 | surf = ax.plot_surface(xx, yy, f, rstride=1, cstride=1, cmap='rainbow_alpha', edgecolor='none') 76 | ax.contour(xx, yy, f, zdir='z', offset=0, cmap=cm.coolwarm) 77 | 78 | map = Basemap(fix_aspect=False) 79 | ax.add_collection3d(map.drawcoastlines(linewidth=0.25)) 80 | ax.add_collection3d(map.drawcountries(linewidth=0.35)) 81 | 82 | ax.set_yticks(range(-90, 90, 30)) 83 | ax.set_xticks(range(-180, 180, 30)) 84 | 85 | ax.set_box_aspect((4, 3, 1)) 86 | ax.set_xlabel('longitude') 87 | ax.set_ylabel('latitude') 88 | ax.set_zlabel('PDF') 89 | ax.set_title('Surface plot of Gaussian 2D KDE for 200x200 points estimated from worldwide tweets 2022') 90 | fig.colorbar(surf, shrink=0.5, aspect=10, location='left') # add color bar indicating the PDF 91 | ax.view_init(20, -60) 92 | 93 | pic = f"results/img/kde_test_world.png" 94 | 95 | #fig.tight_layout() 96 | plt.savefig(pic, dpi=600) 97 | print(f"VAL\tSAVE\tPlot of {len(f.index)} samples is drawn to file: {pic}") 98 | plt.show() 99 | 100 | 101 | # GMM clustering of point on the map 102 | def gmm(coords, peaks, seed): 103 | X = coords.to_numpy(dtype=float) 104 | xmin, xmax = -180., 180. 105 | ymin, ymax = -90., 90. 106 | print(X) 107 | 108 | fig, ax = plt.subplots(figsize=(20, 10)) 109 | ax.set_xlim(xmin, xmax) 110 | ax.set_ylim(ymin, ymax) 111 | map = Basemap() 112 | map.drawcoastlines(linewidth=0.5, color="black") 113 | map.drawcountries(linewidth=0.7, color="black") 114 | map.drawparallels(np.arange(ymin, ymax, 30.)) 115 | map.drawmeridians(np.arange(xmin, xmax, 30.)) 116 | map.drawmapboundary(fill_color='azure') 117 | map.fillcontinents(color='white', lake_color='azure') 118 | 119 | print(f"Calculating clusters of {X.shape[0]} points from GMM with {peaks} means and random seed {seed}") 120 | gmm = GaussianMixture(n_components=peaks, covariance_type='full', random_state=seed).fit(X) 121 | labels = gmm.predict(X) 122 | probs = gmm.predict_proba(X) 123 | size = probs.max(1)**2 124 | 125 | map.scatter(X[:, 0], X[:, 1], c=labels, s=size, cmap="turbo", zorder=5) 126 | #plt.scatter(X[:, 0], X[:, 1], s=4, c="black") 127 | 128 | w_factor = 0.2 / gmm.weights_.max() 129 | for pos, covar, w in zip(gmm.means_, gmm.covariances_, gmm.weights_): 130 | U, s, Vt = np.linalg.svd(covar) 131 | angle = np.degrees(np.arctan2(U[1, 0], U[0, 0])) 132 | width, height = 2 * np.sqrt(s) 133 | for nsig in range(1, 4): 134 | ax.add_patch(Ellipse(pos, nsig * width, nsig * height, angle, 135 | alpha=w * w_factor, color="black")) 136 | 137 | plt.title(f'Scatter plot of coordinated') 138 | plt.xlabel('Longitude') 139 | plt.ylabel('Latitude') 140 | plt.show() 141 | 142 | 143 | # BIC and AIC criterion for GMMs of different peaks number 144 | def gmm_crit(coords, start, end, step, seed): 145 | X = coords.to_numpy(dtype=float) 146 | 147 | fig, ax = plt.subplots(figsize=(20, 10)) 148 | 149 | models = [] 150 | n_components = [] 151 | for n in range(start, end, step): 152 | print(f"Calculating model with {n} means") 153 | n_components.append(n) 154 | models.append(GaussianMixture(n_components=n, 155 | covariance_type='full', 156 | verbose=1, 157 | random_state=seed).fit(X)) 158 | 159 | plt.plot(n_components, [m.bic(X) for m in models], label='BIC') 160 | plt.plot(n_components, [m.aic(X) for m 
in models], label='AIC') 161 | plt.legend(loc='best') 162 | plt.xlabel('n_components') 163 | #plt.show() 164 | 165 | f = f"results/img/gmm.png" 166 | plt.savefig(f) 167 | 168 | 169 | # write GMM to jsonl file 170 | def save_gmm(gmm, filename): 171 | gmm_df = pd.DataFrame(columns=["weights", "means", "covariances", "precisions", "precisions_cholesky"]) 172 | gmm_df[["means", "covariances", "precisions", "precisions_cholesky"]] = gmm_df[["means", "covariances", "precisions", "precisions_cholesky"]].astype(object) 173 | gmm_df["weights"] = gmm.weights_ 174 | 175 | weights = gmm.weights_ 176 | print('2 dec - Estimated number of clusters: ' + str((np.round(weights, 2) > 0).sum())) 177 | print('3 dec - Estimated number of clusters: ' + str((np.round(weights, 3) > 0).sum())) 178 | print('4 dec - Estimated number of clusters: ' + str((np.round(weights, 4) > 0).sum())) 179 | print('5 dec - Estimated number of clusters: ' + str((np.round(weights, 5) > 0).sum())) 180 | 181 | for i in range(len(gmm.covariances_)): 182 | gmm_df.at[i, "means"] = np.array(gmm.means_[i]) 183 | gmm_df.at[i, "covariances"] = np.array(gmm.covariances_[i]) 184 | gmm_df.at[i, "precisions"] = np.array(gmm.precisions_[i]) 185 | gmm_df.at[i, "precisions_cholesky"] = np.array(gmm.precisions_cholesky_[i]) 186 | #print(gmm_df) 187 | 188 | with open(filename, "w") as f: 189 | gmm_df.to_json(f, orient='records', lines=True) 190 | print(f"PARAM\tSAVE\tParameters for GMM with {len(gmm_df.index)} means are written to file: {filename}") 191 | 192 | 193 | # read GMM from jsonl file 194 | def load_gmm(filename): 195 | data = pd.read_json(path_or_buf=filename, lines=True) 196 | #print(data) 197 | print(f"PARAM\tLOAD\tParameters for GMM with {len(data.index)} means are loaded") 198 | means, covs, prec, prec_chol = [], [], [], [] 199 | for i in range(len(data.index)): 200 | means.append(np.array(data.at[i, "means"])) 201 | covs.append(np.array(data.at[i, "covariances"])) 202 | prec.append(np.array(data.at[i, "precisions"])) 203 | prec_chol.append(np.array(data.at[i, "precisions_cholesky"])) 204 | 205 | print(f"MODEL\tINIT\tInitialization of GMM with {len(data.index)} means") 206 | gmm = GaussianMixture(n_components=len(data.index), covariance_type='full', max_iter=1, verbose=0, random_state=seed) 207 | gmm.weights_ = data["weights"].values 208 | 209 | weights = gmm.weights_ 210 | print('2 dec - Estimated number of clusters: ' + str((np.round(weights, 2) > 0).sum())) 211 | print('3 dec - Estimated number of clusters: ' + str((np.round(weights, 3) > 0).sum())) 212 | print('4 dec - Estimated number of clusters: ' + str((np.round(weights, 4) > 0).sum())) 213 | print('5 dec - Estimated number of clusters: ' + str((np.round(weights, 5) > 0).sum())) 214 | 215 | gmm.means_ = np.array(means) 216 | gmm.covariances_ = np.array(covs) 217 | gmm.precisions_ = np.array(prec) 218 | gmm.precisions_cholesky_ = np.array(prec_chol) 219 | 220 | print(f"MODEL\tSET\tParameters of GMM with {len(data.index)} means are set") 221 | return gmm 222 | 223 | 224 | # GMM clustering of point on the map 225 | def plot_gmm(gmm, coords): 226 | X = coords.to_numpy(dtype=float) 227 | xmin, xmax = -180., 180. 228 | ymin, ymax = -90., 90. 
229 | #print(X) 230 | 231 | fig, ax = plt.subplots(figsize=(20, 10)) 232 | ax.set_xlim(xmin, xmax) 233 | ax.set_ylim(ymin, ymax) 234 | map = Basemap() 235 | map.drawcoastlines(linewidth=0.5, color="black") 236 | map.drawcountries(linewidth=0.7, color="black") 237 | map.drawparallels(np.arange(ymin, ymax, 30.)) 238 | map.drawmeridians(np.arange(xmin, xmax, 30.)) 239 | map.drawmapboundary(fill_color='azure') 240 | map.fillcontinents(color='white', lake_color='azure') 241 | 242 | labels = gmm.predict(X) 243 | probs = gmm.predict_proba(X) 244 | size = probs.max(1)**2 245 | 246 | map.scatter(X[:, 0], X[:, 1], c=labels, s=size, cmap="turbo", zorder=5) 247 | 248 | #w_factor = 0.9 / gmm.weights_.max() 249 | for pos, covar, w in zip(gmm.means_, gmm.covariances_, gmm.weights_): 250 | U, s, Vt = np.linalg.svd(covar) 251 | angle = np.degrees(np.arctan2(U[1, 0], U[0, 0])) 252 | width, height = 2 * np.sqrt(s) 253 | for nsig in range(1, 4): 254 | ax.add_patch(Ellipse(pos, nsig * width, nsig * height, angle, 255 | color="black", alpha=0.05)) 256 | 257 | plt.title(f'Scatter plot of coordinates') 258 | plt.xlabel('Longitude') 259 | plt.ylabel('Latitude') 260 | plt.show() 261 | 262 | f = f"results/img/gmm_plot.png" 263 | plt.savefig(f) 264 | 265 | 266 | # fitting GMM to coords data 267 | def calc_gmm(coords, peaks, seed, iter, gmm_filename): 268 | X = coords.to_numpy(dtype=float) 269 | print(f"Calculating GMM with {peaks} means for max {iter} iterations") 270 | gmm = GaussianMixture(n_components=peaks, 271 | covariance_type='full', 272 | verbose=1, 273 | n_init=1, 274 | max_iter=iter, 275 | random_state=seed).fit(X) 276 | save_gmm(gmm, gmm_filename) 277 | 278 | 279 | # fitting Bayesian GMM to coords data 280 | def calc_bgmm(coords, peaks, seed, iter, bgmm_filename): 281 | X = coords.to_numpy(dtype=float) 282 | print(f"Calculating BGMM with {peaks} means for max {iter} iterations") 283 | bgmm = BayesianGaussianMixture(n_components=peaks, 284 | covariance_type='full', 285 | verbose=1, 286 | n_init=1, 287 | max_iter=iter, 288 | random_state=seed).fit(X) 289 | save_gmm(bgmm, bgmm_filename) 290 | 291 | 292 | # generate grid by number of steps 293 | def map_grid(peaks, step=200): 294 | xmin, xmax = -180, 180 295 | ymin, ymax = -90, 90 296 | x = np.linspace(xmin, xmax, step) 297 | y = np.linspace(ymin, ymax, step) 298 | x = np.concatenate((x, peaks[:, 0]), axis=0) 299 | x = np.sort(x) 300 | y = np.concatenate((y, peaks[:, 1]), axis=0) 301 | y = np.sort(y) 302 | xx, yy = np.meshgrid(x, y) 303 | #xx, yy = np.meshgrid(peaks[:, 0], peaks[:, 1]) 304 | return xx, yy 305 | 306 | 307 | # GMM likelihood score surface plot on the map (shifted to min as 0) 308 | def gmm_likelihood(gmm): 309 | xx, yy = map_grid(gmm.means_, 100) 310 | XX = np.array([xx.ravel(), yy.ravel()]).T 311 | 312 | print(f"Calculating scores from GMM for {len(XX)} points") 313 | Z = gmm.score_samples(XX) 314 | zmin = np.min(Z) 315 | zmax = np.max(Z) 316 | print(f"Original\tMin: {zmin}\tMax: {zmax}") 317 | Z = Z - np.min(Z) 318 | zmin = np.min(Z) 319 | zmax = np.max(Z) 320 | print(f"Adjusted\tMin: {zmin}\tMax: {zmax}") 321 | #Z = np.exp(Z) 322 | Z = Z.reshape(xx.shape) 323 | #print(Z) 324 | 325 | # P = gmm.predict_proba(XX) 326 | # S = P.max(1) 327 | # S = S.reshape(xx.shape) 328 | # print(S) 329 | # scaler = MinMaxScaler((0, 100)) 330 | # Z = scaler.fit_transform(Z) 331 | # print(Z) 332 | 333 | fig = plt.figure(figsize=(20, 15)) 334 | ax = plt.axes(projection='3d') 335 | 336 | ncolors = 256 337 | color_array = plt.get_cmap('rainbow')(range(ncolors)) 338 | 
color_array[:,-1] = np.linspace(0.2,1.0,ncolors) 339 | map_object = LinearSegmentedColormap.from_list(name='rainbow_alpha',colors=color_array) 340 | plt.register_cmap(cmap=map_object) 341 | 342 | surf = ax.plot_surface(xx, yy, Z, rstride=1, cstride=1, cmap='rainbow_alpha', edgecolor='none') 343 | contour = ax.contour(xx, yy, Z, levels=np.linspace(zmin, zmax, 500), zdir='z', offset=0, cmap='rainbow_alpha') 344 | 345 | map = Basemap(fix_aspect=False) 346 | ax.add_collection3d(map.drawcoastlines(linewidth=0.25)) 347 | ax.add_collection3d(map.drawcountries(linewidth=0.35)) 348 | 349 | ax.set_yticks(range(-90, 90, 30)) 350 | ax.set_xticks(range(-180, 180, 30)) 351 | ax.set_zticks(range(int(zmin), int(zmax), 5)) 352 | 353 | ax.set_box_aspect((4, 3, 1)) 354 | ax.set_xlabel('longitude') 355 | ax.set_ylabel('latitude') 356 | ax.set_zlabel('Log-likelihood') 357 | ax.set_title(f'Surface plot of likelihood from GMM with {len(gmm.weights_)} peaks for {len(XX)} points estimated from worldwide tweets 2022') 358 | fig.colorbar(surf, shrink=0.5, aspect=10, location='left') # add color bar indicating the likelihood 359 | ax.view_init(30, -60) 360 | 361 | pic = f"results/img/gmm_likelihood_world.png" 362 | 363 | #fig.tight_layout() 364 | plt.savefig(pic, dpi=600) 365 | print(f"PLOT\tSAVE\tPlot of GMM likelihood for {len(XX)} points is drawn to file: {pic}") 366 | plt.show() 367 | 368 | 369 | # GMM PDF surface plot on the map 370 | def gmm_density(gmm): 371 | xmin, xmax = -180, 180 372 | ymin, ymax = -90, 90 373 | xx, yy = np.mgrid[xmin:xmax:200j, ymin:ymax:200j] 374 | XX = np.array([xx.ravel(), yy.ravel()]).T 375 | 376 | print(f"Calculating scores from GMM for {len(XX)} points") 377 | Z = gmm.score_samples(XX) 378 | zmin = np.min(Z) 379 | zmax = np.max(Z) 380 | print(f"Original\tMin: {zmin}\tMax: {zmax}") 381 | Z = np.exp(Z) 382 | zmin = np.min(Z) 383 | zmax = np.max(Z) 384 | print(f"Probability\tMin: {zmin}\tMax: {zmax}") 385 | Z = Z.reshape(xx.shape) 386 | #print(Z) 387 | 388 | fig = plt.figure(figsize=(20, 15)) 389 | ax = plt.axes(projection='3d') 390 | 391 | ncolors = 256 392 | color_array = plt.get_cmap('rainbow')(range(ncolors)) 393 | color_array[:,-1] = np.linspace(0.2,1.0,ncolors) 394 | map_object = LinearSegmentedColormap.from_list(name='rainbow_alpha',colors=color_array) 395 | plt.register_cmap(cmap=map_object) 396 | 397 | surf = ax.plot_surface(xx, yy, Z, rstride=1, cstride=1, cmap='rainbow_alpha', edgecolor='none') 398 | contour = ax.contour(xx, yy, Z, levels=np.linspace(zmin, zmax, 500), zdir='z', offset=0, cmap='rainbow_alpha') 399 | 400 | map = Basemap(fix_aspect=False) 401 | ax.add_collection3d(map.drawcoastlines(linewidth=0.25)) 402 | ax.add_collection3d(map.drawcountries(linewidth=0.35)) 403 | 404 | ax.set_yticks(range(-90, 90, 30)) 405 | ax.set_xticks(range(-180, 180, 30)) 406 | #ax.set_zticks(range(int(zmin), int(zmax))) 407 | 408 | ax.set_box_aspect((4, 3, 1)) 409 | ax.set_xlabel('longitude') 410 | ax.set_ylabel('latitude') 411 | ax.set_zlabel('Probability') 412 | ax.set_title(f'Surface plot of probability from GMM with {len(gmm.weights_)} peaks for {len(XX)} points estimated from worldwide tweets 2022') 413 | fig.colorbar(surf, shrink=0.5, aspect=10, location='left') # add color bar indicating the probability 414 | ax.view_init(10, -60) 415 | 416 | pic = f"results/img/gmm_probability_world.png" 417 | 418 | #fig.tight_layout() 419 | plt.savefig(pic, dpi=600) 420 | print(f"PLOT\tSAVE\tPlot of GMM probability for {len(XX)} points is drawn to file: {pic}") 421 | plt.show() 422 | 423 | 424 | 
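# Illustrative sketch (hypothetical helper, not called anywhere in this script): gmm_likelihood()
# and gmm_density() differ only in how gmm.score_samples() is post-processed. score_samples()
# returns log p(x); the likelihood plot shifts it so its minimum is 0, while the density plot
# exponentiates it back to a PDF. On toy data, np.exp(score_samples()) integrates to roughly 1
# over a grid that covers the data, which is why the z-axis of gmm_density() can be read as a
# probability density rather than a shifted log-likelihood.
def _density_sanity_check(seed=42):
    rng = np.random.default_rng(seed)
    toy = rng.normal(loc=[10.0, 50.0], scale=[5.0, 3.0], size=(1000, 2))  # toy lon, lat points
    toy_gmm = GaussianMixture(n_components=3, covariance_type='full', random_state=seed).fit(toy)

    lon = np.linspace(-180, 180, 361)
    lat = np.linspace(-90, 90, 181)
    xx, yy = np.meshgrid(lon, lat)
    grid = np.column_stack([xx.ravel(), yy.ravel()])

    pdf = np.exp(toy_gmm.score_samples(grid))     # log-likelihood -> probability density
    cell = (lon[1] - lon[0]) * (lat[1] - lat[0])  # grid cell area in square degrees
    print(f"Integral of exp(score_samples) over the grid: {pdf.sum() * cell:.3f}")  # ~1.0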
# GMM PDF contour plot on the map 425 | def gmm_contour(gmm): 426 | xmin, xmax = -180, 180 427 | ymin, ymax = -90, 90 428 | xx, yy = map_grid(gmm.means_, 200) 429 | XX = np.array([xx.ravel(), yy.ravel()]).T 430 | 431 | print(f"Calculating scores from GMM for {len(XX)} points") 432 | Z = gmm.score_samples(XX) 433 | zmin = np.min(Z) 434 | zmax = np.max(Z) 435 | print(f"Original\tMin: {zmin}\tMax: {zmax}") 436 | Z = Z.reshape(xx.shape) 437 | 438 | fig, ax = plt.subplots(figsize=(20, 10)) 439 | ax.set_xlim(xmin, xmax) 440 | ax.set_ylim(ymin, ymax) 441 | 442 | contour = ax.contour(xx, yy, Z, levels=np.linspace(zmin, zmax, 300), cmap='RdYlGn_r', linewidths=0.5) 443 | 444 | map = Basemap() 445 | map.drawcoastlines(linewidth=0.5, color="black") 446 | map.drawcountries(linewidth=0.7, color="black") 447 | map.drawparallels(np.arange(ymin, ymax, 30.)) 448 | map.drawmeridians(np.arange(xmin, xmax, 30.)) 449 | map.drawmapboundary(fill_color='azure') 450 | map.fillcontinents(color='white', lake_color='azure') 451 | 452 | peaks = gmm.means_ 453 | map.scatter(peaks[:, 0], peaks[:, 1], c=gmm.weights_, cmap="RdYlGn_r", s=0.7, zorder=5) 454 | 455 | plt.title(f'Contour plot of likelihood from GMM with {len(gmm.weights_)} peaks for {len(XX)} points estimated from worldwide tweets 2022') 456 | fig.colorbar(contour, shrink=0.5, aspect=10, location='left') # add color bar indicating the likelihood 457 | plt.xlabel('Longitude') 458 | plt.ylabel('Latitude') 459 | 460 | pic = f"results/img/gmm_contour_plot.png" 461 | plt.savefig(pic, dpi=600) 462 | print(f"PLOT\tSAVE\tPlot of GMM probability for {len(XX)} points is drawn to file: {pic}") 463 | 464 | plt.show() 465 | 466 | 467 | filename = "twitter-2020-02-28.txt" 468 | world = "map-world/world-twitter-2022-coords.jsonl" 469 | kde_results = "kde_world_2022.jsonl" 470 | 471 | coords = load_jsonl(filename) 472 | print(coords) 473 | 474 | peaks = 1000 475 | seed = 42 476 | iter = 1000 477 | 478 | gmm_filename = f"datasets/gmm-p{peaks}-c{len(coords.index)}.jsonl" 479 | bgmm_filename = f"datasets/bgmm-p{peaks}-c{len(coords.index)}.jsonl" 480 | gmm_200 = "datasets/200-gmm.jsonl" 481 | bgmm_cluser = "datasets/bgmm-p200-c12057022.jsonl" 482 | 483 | #calc_gmm(coords, peaks, seed, iter, gmm_filename) 484 | #calc_bgmm(coords, peaks, seed, iter, bgmm_filename) 485 | 486 | # gmm = load_gmm(bgmm_cluser) 487 | X = coords[["longitude", "latitude"]].to_numpy(dtype=float) 488 | # Z = gmm.score_samples(X) 489 | # print(Z.min(), Z.max(), Z.mean()) 490 | # gmm = load_gmm(gmm_200) 491 | # Z = gmm.score_samples(X) 492 | # print(Z.min(), Z.max(), Z.mean()) 493 | # gmm_likelihood(gmm) 494 | # gmm_density(gmm) 495 | # gmm_contour(gmm) 496 | 497 | 498 | 499 | # test for differences in covariance 500 | # 501 | # from scipy.linalg import cholesky 502 | # 503 | # cov = "spherical" 504 | # gmm = GaussianMixture(5, covariance_type=cov, max_iter=1).fit(X) 505 | # print(cov) 506 | # print(gmm.covariances_) 507 | # print(gmm.precisions_) 508 | # print(gmm.precisions_cholesky_) 509 | # print(cholesky(gmm.covariances_)) 510 | # 511 | # cov = "diag" 512 | # gmm = GaussianMixture(5, covariance_type=cov, max_iter=1).fit(X) 513 | # print(cov) 514 | # print(gmm.covariances_) 515 | # print(gmm.precisions_) 516 | # print(gmm.precisions_cholesky_) 517 | # 518 | # cov = "full" 519 | # gmm = GaussianMixture(5, covariance_type=cov, max_iter=1).fit(X) 520 | # print(cov) 521 | # print(gmm.covariances_) 522 | # print(gmm.precisions_) 523 | # print(gmm.precisions_cholesky_) 524 | # 525 | # cov = "tied" 526 | # gmm = 
GaussianMixture(5, covariance_type=cov, max_iter=1).fit(X) 527 | # print(cov) 528 | # print(gmm.covariances_) 529 | # print(gmm.precisions_) 530 | # print(gmm.precisions_cholesky_) 531 | -------------------------------------------------------------------------------- /supplementary_resources/scripts/python/data-from-test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import re 3 | import string 4 | from sklearn.model_selection import train_test_split 5 | # df = pd.read_csv("datasets/full_text.txt", 6 | # delimiter="\t", 7 | # header=None, 8 | # encoding='latin-1') 9 | # df.columns = ["user", "time", "x", "lat", "lon", "text"] 10 | # print(len(df['user'].unique())) 11 | # print(df.info()) 12 | # tdf = df.sample(n=300000, random_state=42) 13 | # print(len(tdf['user'].unique())) 14 | # df = df.drop(df.sample(n=300000, random_state=42).index, axis=0) 15 | # edf = df.sample(n=76000, random_state=42) 16 | # print(df.info()) 17 | # print(len(edf['user'].unique())) 18 | df = pd.read_json(path_or_buf="datasets/worldwide-twitter-day.jsonl", lines=True) 19 | print(len(df['user'].unique())) 20 | print(df.user.mode()) 21 | n = 100 22 | users = df['user'].value_counts()[:n].index.tolist() 23 | 24 | for u in users: 25 | print((df.user == u).sum(), u) 26 | 27 | user_df = df[df['user'].isin(users)] 28 | with open("datasets/worldwide-twitter-day-users.jsonl", "w") as f: 29 | user_df.to_json(f, orient='records', lines=True) 30 | 31 | 32 | # users = [y for x, y in df.groupby('user')] 33 | # 34 | # train_users, test_users = train_test_split(users, test_size=0.2, random_state=42) 35 | # train_users, test_users = list(train_users), list(test_users) 36 | # train_size, test_size = len(train_users), len(test_users) 37 | # 38 | # train_df = pd.concat(train_users) 39 | # test_df = pd.concat(test_users) 40 | # print(train_df.info()) 41 | # print(len(train_df['user'].unique())) 42 | # print(test_df.info()) 43 | # print(len(test_df['user'].unique())) 44 | # 45 | 46 | # with open("datasets/eisenstein_test.jsonl", "w") as f: 47 | # test_df.to_json(f, orient='records', lines=True) 48 | 49 | 50 | # print(len(df["user"].unique())) 51 | # df_by_user = pd.DataFrame(columns=["lon", "lat", "text", "user"]) 52 | # df_by_user["user"] = df["user"].unique() 53 | # print(df_by_user.info()) 54 | # print(df_by_user) 55 | # for i in range(len(df_by_user.index)): 56 | # user = df_by_user.loc[df_by_user.index[i], "user"] 57 | # user_texts = df.loc[df['user'] == user] 58 | # df_by_user.loc[df_by_user.index[i], "text"] = '. 
'.join(user_texts["text"].astype(str)) 59 | # df_by_user.loc[df_by_user.index[i], "lon"] = user_texts.loc[user_texts.index[0], "lon"] 60 | # df_by_user.loc[df_by_user.index[i], "lat"] = user_texts.loc[user_texts.index[0], "lat"] 61 | # 62 | # print(df) 63 | # 64 | 65 | 66 | # def chunks(lst, n): 67 | # """Yield successive n-sized chunks from lst.""" 68 | # for i in range(0, len(lst), n): 69 | # yield lst[i:i + n] 70 | # 71 | # df = pd.read_json(path_or_buf="datasets/eisenstein_user.jsonl", lines=True) 72 | # print(df.info()) 73 | # print(df["text"]) 74 | # lengths = df["text"].str.len() 75 | # print(lengths) 76 | # count = df['text'].str.split().str.len() 77 | # print(count) 78 | # 79 | # df_split = pd.DataFrame(columns=["lon", "lat", "text", "user"]) 80 | # for i in range(len(df.index)): 81 | # texts = df.loc[df.index[i], "text"] 82 | # words = len(texts.split()) 83 | # tw = texts.split() 84 | # if words // 300 > 0: 85 | # n = words//300 + 1 86 | # size = words//n + 1 87 | # splitted = [tw[i:i + size] for i in range(0, len(tw), size)] 88 | # for k in range(len(splitted)): 89 | # df_short = df.iloc[i] 90 | # splitted[k] = " ".join(splitted[k]) 91 | # df_short["text"] = splitted[k] 92 | # df_split = df_split.append(df_short, ignore_index=True) 93 | # else: 94 | # df_split = df_split.append(df.iloc[i], ignore_index=True) 95 | # 96 | # 97 | # with open("datasets/eisenstein_user_test.jsonl", "w") as f: 98 | # df_split.to_json(f, orient='records', lines=True) 99 | 100 | # def nlp_filtering(text): 101 | # def filter_punctuation(text): 102 | # punctuationfree="".join([i for i in text if i not in string.punctuation]) 103 | # return punctuationfree 104 | # 105 | # def filter_websites(text): 106 | # #pattern = r'(http\:\/\/|https\:\/\/)?([a-z0-9][a-z0-9\-]*\.)+[a-z][a-z\-]*' 107 | # pattern = r'http\S+' 108 | # text = re.sub(pattern, '', text) 109 | # return text 110 | # 111 | # text = filter_websites(text) 112 | # text = filter_punctuation(text) 113 | # return text 114 | # 115 | # df = pd.read_json(path_or_buf="datasets/eisenstein_user_test.jsonl", lines=True) 116 | # print(df.info()) 117 | # print(df["text"]) 118 | # lengths = df["text"].str.len() 119 | # print(lengths) 120 | # count = df['text'].str.split().str.len() 121 | # print(count.max()) 122 | # 123 | # df["text"] = df["text"].apply(nlp_filtering) 124 | # 125 | # count = df['text'].str.split().str.len() 126 | # print(count.max()) 127 | 128 | 129 | # df = pd.read_json(path_or_buf="results/val-data/U-NON-GEO+GEO-ONLY-O5-d-total_sum-mf_sum-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N100000_2022-10-29.jsonl", lines=True) 130 | # df = df.drop(df.sample(n=99900, random_state=42).index, axis=0) 131 | # with open("results/val-data/pmop-test.jsonl", "w") as f: 132 | # df.to_json(f, orient='records', lines=True) 133 | 134 | # size = 300000 135 | # vs = 300000 136 | # df = pd.read_json(path_or_buf="datasets/worldwide-twitter-day.jsonl", lines=True) 137 | # print(df.info()) 138 | # print(len(df['lang'].unique())) 139 | # print(len(df['code'].unique())) 140 | # print(len(df['user'].unique())) 141 | # tdf = df.sample(n=size, random_state=42) 142 | # print(len(tdf['lang'].unique())) 143 | # print(len(tdf['code'].unique())) 144 | # print(len(tdf['user'].unique())) 145 | # df = df.drop(df.sample(n=size, random_state=42).index, axis=0) 146 | # edf = df.sample(n=vs, random_state=42) 147 | # print(df.info()) 148 | # print(len(edf['lang'].unique())) 149 | # print(len(edf['code'].unique())) 150 | # print(len(edf['user'].unique())) 151 | 
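# Alternative sketch (not used by this script): the commented-out per-user loop earlier in this
# file joins each user's tweets into one text and keeps the first lon/lat. Assuming the same
# "user", "text", "lon", "lat" columns, the aggregation can be expressed as a single groupby,
# which avoids the row-by-row .loc writes:
#
# def aggregate_tweets_by_user(df):
#     return (df.groupby("user", as_index=False)
#               .agg(text=("text", lambda t: ". ".join(t.astype(str))),
#                    lon=("lon", "first"),
#                    lat=("lat", "first")))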
-------------------------------------------------------------------------------- /supplementary_resources/scripts/python/geotext-dataframe.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import re 3 | import string 4 | from sklearn.model_selection import train_test_split 5 | 6 | 7 | # GeoText data from Eisenstein work - forming train and test dataframes 8 | 9 | # df = pd.read_csv("datasets/full_text.txt", 10 | # delimiter="\t", 11 | # header=None, 12 | # encoding='latin-1') 13 | # df.columns = ["user", "time", "x", "lat", "lon", "text"] 14 | # print(len(df['user'].unique())) 15 | # print(df.info()) 16 | # tdf = df.sample(n=300000, random_state=42) 17 | # print(len(tdf['user'].unique())) 18 | # df = df.drop(df.sample(n=300000, random_state=42).index, axis=0) 19 | # edf = df.sample(n=76000, random_state=42) 20 | # print(df.info()) 21 | # print(len(edf['user'].unique())) 22 | 23 | df = pd.read_json(path_or_buf="datasets/eisenstein.jsonl", lines=True) 24 | 25 | users = [y for x, y in df.groupby('user')] 26 | 27 | train_users, test_users = train_test_split(users, test_size=0.2, random_state=42) 28 | train_users, test_users = list(train_users), list(test_users) 29 | train_size, test_size = len(train_users), len(test_users) 30 | 31 | train_df = pd.concat(train_users) 32 | test_df = pd.concat(test_users) 33 | print(train_df.info()) 34 | print(len(train_df['user'].unique())) 35 | print(test_df.info()) 36 | print(len(test_df['user'].unique())) 37 | 38 | with open("datasets/eisenstein_train.jsonl", "w") as f: 39 | train_df.to_json(f, orient='records', lines=True) 40 | with open("datasets/eisenstein_test.jsonl", "w") as f: 41 | test_df.to_json(f, orient='records', lines=True) 42 | 43 | 44 | # print(len(df["user"].unique())) 45 | # df_by_user = pd.DataFrame(columns=["lon", "lat", "text", "user"]) 46 | # df_by_user["user"] = df["user"].unique() 47 | # print(df_by_user.info()) 48 | # print(df_by_user) 49 | # for i in range(len(df_by_user.index)): 50 | # user = df_by_user.loc[df_by_user.index[i], "user"] 51 | # user_texts = df.loc[df['user'] == user] 52 | # df_by_user.loc[df_by_user.index[i], "text"] = '. 
'.join(user_texts["text"].astype(str)) 53 | # df_by_user.loc[df_by_user.index[i], "lon"] = user_texts.loc[user_texts.index[0], "lon"] 54 | # df_by_user.loc[df_by_user.index[i], "lat"] = user_texts.loc[user_texts.index[0], "lat"] 55 | # 56 | # print(df) 57 | # 58 | 59 | 60 | # def chunks(lst, n): 61 | # """Yield successive n-sized chunks from lst.""" 62 | # for i in range(0, len(lst), n): 63 | # yield lst[i:i + n] 64 | # 65 | # df = pd.read_json(path_or_buf="datasets/eisenstein_user.jsonl", lines=True) 66 | # print(df.info()) 67 | # print(df["text"]) 68 | # lengths = df["text"].str.len() 69 | # print(lengths) 70 | # count = df['text'].str.split().str.len() 71 | # print(count) 72 | # 73 | # df_split = pd.DataFrame(columns=["lon", "lat", "text", "user"]) 74 | # for i in range(len(df.index)): 75 | # texts = df.loc[df.index[i], "text"] 76 | # words = len(texts.split()) 77 | # tw = texts.split() 78 | # if words // 300 > 0: 79 | # n = words//300 + 1 80 | # size = words//n + 1 81 | # splitted = [tw[i:i + size] for i in range(0, len(tw), size)] 82 | # for k in range(len(splitted)): 83 | # df_short = df.iloc[i] 84 | # splitted[k] = " ".join(splitted[k]) 85 | # df_short["text"] = splitted[k] 86 | # df_split = df_split.append(df_short, ignore_index=True) 87 | # else: 88 | # df_split = df_split.append(df.iloc[i], ignore_index=True) 89 | # 90 | # 91 | # with open("datasets/eisenstein_user_test.jsonl", "w") as f: 92 | # df_split.to_json(f, orient='records', lines=True) 93 | 94 | # def nlp_filtering(text): 95 | # def filter_punctuation(text): 96 | # punctuationfree="".join([i for i in text if i not in string.punctuation]) 97 | # return punctuationfree 98 | # 99 | # def filter_websites(text): 100 | # #pattern = r'(http\:\/\/|https\:\/\/)?([a-z0-9][a-z0-9\-]*\.)+[a-z][a-z\-]*' 101 | # pattern = r'http\S+' 102 | # text = re.sub(pattern, '', text) 103 | # return text 104 | # 105 | # text = filter_websites(text) 106 | # text = filter_punctuation(text) 107 | # return text 108 | # 109 | # df = pd.read_json(path_or_buf="datasets/eisenstein_user_test.jsonl", lines=True) 110 | # print(df.info()) 111 | # print(df["text"]) 112 | # lengths = df["text"].str.len() 113 | # print(lengths) 114 | # count = df['text'].str.split().str.len() 115 | # print(count.max()) 116 | # 117 | # df["text"] = df["text"].apply(nlp_filtering) 118 | # 119 | # count = df['text'].str.split().str.len() 120 | # print(count.max()) 121 | 122 | 123 | # df = pd.read_json(path_or_buf="results/val-data/U-NON-GEO+GEO-ONLY-O5-d-total_sum-mf_sum-pos_spher-weighted-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N100000_2022-10-29.jsonl", lines=True) 124 | # df = df.drop(df.sample(n=99900, random_state=42).index, axis=0) 125 | # with open("results/val-data/pmop-test.jsonl", "w") as f: 126 | # df.to_json(f, orient='records', lines=True) 127 | 128 | # size = 300000 129 | # vs = 300000 130 | # df = pd.read_json(path_or_buf="datasets/worldwide-twitter-day.jsonl", lines=True) 131 | # print(df.info()) 132 | # print(len(df['lang'].unique())) 133 | # print(len(df['code'].unique())) 134 | # print(len(df['user'].unique())) 135 | # tdf = df.sample(n=size, random_state=42) 136 | # print(len(tdf['lang'].unique())) 137 | # print(len(tdf['code'].unique())) 138 | # print(len(tdf['user'].unique())) 139 | # df = df.drop(df.sample(n=size, random_state=42).index, axis=0) 140 | # edf = df.sample(n=vs, random_state=42) 141 | # print(df.info()) 142 | # print(len(edf['lang'].unique())) 143 | # print(len(edf['code'].unique())) 144 | # print(len(edf['user'].unique())) 145 | 
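# Alternative sketch (not used by this script): the user-level split above builds a list with one
# DataFrame per user and splits that list. Assuming the same "user" column from eisenstein.jsonl,
# sklearn's GroupShuffleSplit gives the same guarantee (all tweets of a user land on the same side
# of the split) without materialising the per-user frames:
#
# from sklearn.model_selection import GroupShuffleSplit
#
# def split_by_user(df, test_size=0.2, seed=42):
#     gss = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=seed)
#     train_idx, test_idx = next(gss.split(df, groups=df["user"]))
#     return df.iloc[train_idx], df.iloc[test_idx]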
-------------------------------------------------------------------------------- /supplementary_resources/scripts/python/hf_repo.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from utils.regressor import * 3 | from utils.result_manager import * 4 | from transformers import Pipeline 5 | from transformers import BertConfig, BertTokenizer 6 | 7 | base_model = "bert-base-multilingual-cased" 8 | 9 | local_model = "P-NON-GEO+GEO-ONLY-O5.pth" 10 | 11 | hub_model = 'k4tel/geo-bert-multilingual' 12 | hf_folder = "models/hf/model" 13 | 14 | local_model = f'models/final/{local_model}' 15 | 16 | 17 | # upload local model and save for the future HF repo upload 18 | def save(local_model, hf_folder, base_model): 19 | config = BertConfig.from_pretrained(base_model) 20 | 21 | feature_outputs = BERTregModel(5, "spher", True, ["NON-GEO", "GEO-ONLY"], base_model).feature_outputs 22 | 23 | custom_model = GeoBertModel(config=config, feature_outputs=feature_outputs) 24 | 25 | if torch.cuda.is_available(): 26 | state = torch.load(local_model) 27 | else: 28 | state = torch.load(local_model, map_location='cpu') 29 | 30 | model_state_dict = state['model_state_dict'] 31 | 32 | custom_model.load_state_dict(model_state_dict) 33 | 34 | tokenizer = BertTokenizer.from_pretrained(base_model) 35 | 36 | custom_model.save_pretrained(hf_folder) 37 | tokenizer.save_pretrained(hf_folder) 38 | 39 | # add all files from hf_folder to the HF repo manually 40 | 41 | 42 | # huggingface framework load from repo + prediction pipeline test 43 | def load(hub_model, base_model): 44 | model_wrapper = BERTregModel(5, "spher", True, ["NON-GEO", "GEO-ONLY"], base_model, hub_model) 45 | benchmark = ModelBenchmark(model_wrapper, True, "pos", "mean", "mean") 46 | 47 | tokenizer = BertTokenizer.from_pretrained(hub_model) 48 | model = model_wrapper.model 49 | 50 | # testing model 51 | text = "CIA and FBI can track anyone, and you willingly give the data away" 52 | inputs = tokenizer(text, return_tensors="pt") 53 | 54 | with torch.no_grad(): 55 | outputs = model(**inputs) 56 | prob_model = benchmark.prob_models(outputs) 57 | 58 | print(f"RESULT\tPost-processing raw model outputs: {outputs}") 59 | result = ResultManager(None, text, "NON-GEO", torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"), benchmark, False, False, hub_model) 60 | result.soft_outputs(list([prob_model])) 61 | 62 | ind = np.argwhere(np.round(result.weights[0, :] * 100, 2) > 0) 63 | significant = result.means[0, ind].reshape(-1, 2) 64 | weights = result.weights[0, ind].flatten() 65 | 66 | sig_weights = np.round(weights * 100, 2) 67 | sig_weights = sig_weights[sig_weights > 0] 68 | 69 | print(f"RESULT\t{len(sig_weights)} significant prediction outcome(s):") 70 | 71 | for i in range(len(sig_weights)): 72 | point = f"lon: {' lat: '.join(map(str, significant[i]))}" 73 | print(f"\tOut {i + 1}\t{sig_weights[i]}%\t-\t{point}") 74 | 75 | 76 | # save(local_model, hf_folder, base_model) 77 | # load(hub_model, base_model) 78 | -------------------------------------------------------------------------------- /supplementary_resources/scripts/python/json_split.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | # specify the input and output file names 5 | folder = "datasets/" 6 | input_file = 'worldwide-twitter-2021.jsonl' 7 | output_file_prefix = f'{folder}worldwide-twitter-2021_' 8 | max_lines_per_file = 20000000 9 | 10 | # read the input file 
and split the data into multiple files 11 | with open(f"{folder}{input_file}", 'r') as f: 12 | line_count = 0 13 | file_count = 0 14 | out_f = None 15 | for line in f: 16 | if line_count % max_lines_per_file == 0: 17 | # create a new output file 18 | if out_f: 19 | out_f.close() 20 | output_file_name = f'{output_file_prefix}{file_count}.jsonl' 21 | out_f = open(output_file_name, 'w') 22 | file_count += 1 23 | json_obj = json.loads(line.strip()) 24 | out_f.write(f'{line.strip()}\n') 25 | line_count += 1 26 | if out_f: 27 | out_f.close() 28 | 29 | -------------------------------------------------------------------------------- /supplementary_resources/scripts/python/loss_graph_prob.py: -------------------------------------------------------------------------------- 1 | import tkinter 2 | import matplotlib 3 | matplotlib.use('TkAgg') 4 | 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | from mpl_toolkits.mplot3d import Axes3D 8 | from mpl_toolkits import mplot3d 9 | 10 | def loss(D, sigma): 11 | numerator = np.exp(-D**2 / (2 * sigma)) 12 | denominator = 2 * np.pi * sigma 13 | return -np.log(numerator / denominator) 14 | 15 | 16 | num = 100 17 | D_vals = np.linspace(0, 1, num) 18 | sigma_vals = np.linspace(1 / (2 * np.pi), 1, num) 19 | D_grid, sigma_grid = np.meshgrid(D_vals, sigma_vals) 20 | loss_vals = loss(D_grid, sigma_grid) 21 | 22 | fig = plt.figure() 23 | ax = fig.add_subplot(111, projection='3d') 24 | 25 | surf = ax.plot_surface(D_grid, sigma_grid, loss_vals, cmap='coolwarm', alpha = 0.7) 26 | cntr = ax.contour3D(D_grid, sigma_grid, loss_vals, 100, cmap='coolwarm') 27 | 28 | sigma_const = 1 / (2 * np.pi) 29 | sigma_vals_off = np.linspace(0.001, sigma_const, num) 30 | 31 | D_grid_off, sigma_grid_off = np.meshgrid(D_vals, sigma_vals_off) 32 | loss_vals_off = loss(D_grid_off, sigma_grid_off) 33 | loss_vals_off_clipped = np.clip(loss_vals_off, -1, np.max(loss_vals)) 34 | 35 | cntr_off = ax.contour3D(D_grid_off, sigma_grid_off, loss_vals_off_clipped, 100, cmap='coolwarm', alpha=0.5) 36 | 37 | 38 | loss_wall = np.linspace(-1, np.max(loss_vals), num) 39 | D, L = np.meshgrid(D_grid, loss_wall) 40 | 41 | bound = ax.plot_surface(D, np.ones_like(D) * sigma_const, L, facecolor="black", alpha=0.2) 42 | 43 | sigma_vals_full = np.linspace(0.001, 1, num) 44 | D_grid_full, sigma_grid_full = np.meshgrid(D_vals, sigma_vals_full) 45 | loss_zero = ax.plot_surface(D_grid_full, sigma_grid_full, loss_vals*0, facecolor="black", alpha = 0.2) 46 | 47 | 48 | ax.set_xlabel(r'$D^2$') 49 | ax.set_ylabel(r'$\sigma$') 50 | ax.set_zlabel('Loss') 51 | 52 | ax.set_title("Negative Log-Likelihood Loss") 53 | 54 | ax.text(-0.2, sigma_const, -1, r'$\frac{1}{2\pi}$', color='red', fontsize=14, ha='center', va='center') 55 | ax.text(-0.2, -0.2, 0, 'min', color='red', fontsize=12, ha='center', va='center') 56 | 57 | 58 | ax.set_zlim(-1, np.max(loss_vals)) 59 | ax.set_xlim(0, 1) 60 | ax.set_ylim(0, 1) 61 | 62 | fig.colorbar(surf, shrink=0.9, pad=0.1, location="left") 63 | 64 | ax.view_init(30, 164) 65 | 66 | plt.savefig("loss_graph.png", dpi=600) 67 | 68 | plt.show() 69 | 70 | 71 | -------------------------------------------------------------------------------- /supplementary_resources/scripts/python/loss_graph_spat.py: -------------------------------------------------------------------------------- 1 | import tkinter 2 | import matplotlib 3 | matplotlib.use('TkAgg') 4 | 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | from mpl_toolkits.mplot3d import Axes3D 8 | from mpl_toolkits import mplot3d 9 | 10 | 
def loss(x, y): 11 | return np.power(x, 2) + np.power(y, 2) 12 | 13 | num = 100 14 | 15 | X_vals = np.linspace(0, 140, num) 16 | Y_vals = np.linspace(0, 120, num) 17 | 18 | X_grid, Y_grid = np.meshgrid(X_vals, Y_vals) 19 | 20 | loss_vals = loss(X_grid, Y_grid) 21 | 22 | fig = plt.figure() 23 | ax = fig.add_subplot(111, projection='3d') 24 | 25 | surf = ax.plot_surface(X_grid, Y_grid, loss_vals, cmap='coolwarm', alpha = 0.7) 26 | cntr = ax.contour3D(X_grid, Y_grid, loss_vals, 100, cmap='coolwarm') 27 | 28 | loss_zero = ax.plot_surface(X_grid, Y_grid, loss_vals*0, facecolor="black", alpha = 0.2) 29 | loss_zero = ax.plot_surface(X_grid, Y_grid, loss_vals*0 + 15000, facecolor="black", alpha = 0.2) 30 | 31 | 32 | ax.set_xlabel(r'$\Delta Y_{lon}$') 33 | ax.set_ylabel(r'$\Delta Y_{lat}$') 34 | ax.set_zlabel('Loss') 35 | 36 | ax.set_title("Squared Euclidean Distance") 37 | 38 | ax.set_zlim(-1, np.max(loss_vals)) 39 | ax.set_xlim(0, 140) 40 | ax.set_ylim(0, 120) 41 | 42 | fig.colorbar(surf, shrink=0.9, pad=0.1, location="left") 43 | 44 | ax.view_init(30, 164) 45 | 46 | plt.savefig("loss_graph_spat.png", dpi=600) 47 | 48 | plt.show() 49 | 50 | 51 | -------------------------------------------------------------------------------- /supplementary_resources/scripts/python/ner-gazetteer-test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import sys 3 | from shapely.geometry import Point, LineString 4 | import geopandas as gpd 5 | from geopandas import GeoDataFrame 6 | import spacy 7 | import os.path 8 | from os import path 9 | from geopy.geocoders import Nominatim 10 | from geopy.exc import GeocoderTimedOut 11 | import geopy.distance 12 | import matplotlib.pyplot as plt 13 | 14 | train_set = "geo.jsonl" 15 | temp = "nlp.jsonl" 16 | test = "test.jsonl" 17 | result = "result-small.jsonl" 18 | 19 | empty = "empty-twitter-example-big.jsonl" 20 | 21 | nlp_model = "en_core_web_sm" 22 | world_model = "naturalearth_lowres" 23 | 24 | # test run for NER + gazetteer approach 25 | 26 | 27 | def read_json(file): 28 | print("=== Reading data from:", file) 29 | try: 30 | df = pd.read_json(path_or_buf=file, lines=True) 31 | #print(df.head(5)) 32 | print(df) 33 | except: 34 | print("=== Can't read data from:", file) 35 | 36 | return df 37 | 38 | 39 | def coords_on_map(df, long="long", lat="lat", clr="red"): 40 | geometry = [Point(xy) for xy in zip(df[long], df[lat])] 41 | gdf = GeoDataFrame(df, geometry=geometry) 42 | world = gpd.read_file(gpd.datasets.get_path(world_model)) 43 | 44 | gdf.plot(ax=world.plot(color='white', 45 | edgecolor='black', 46 | figsize=(20, 16)), 47 | marker='o', 48 | color=clr, 49 | markersize=5); 50 | 51 | 52 | def lines_on_map(df, x1="long", y1="lat", x2="long_gc", y2="lat_gc", clr="red"): 53 | world_ax = gpd.read_file(gpd.datasets 54 | .get_path(world_model)).plot( 55 | color='white', 56 | edgecolor='yellow', 57 | figsize=(20, 16)) 58 | 59 | geometry = [LineString([[x1,y1], [x2,y2]]) for 60 | x1,y1,x2,y2 in zip(df[x1], df[y1], df[x2], df[y2])] 61 | gdf = GeoDataFrame(df, geometry=geometry) 62 | gdf.plot(ax=world_ax, marker=None, color="black", markersize=1); 63 | 64 | geometry = [Point(xy) for xy in zip(df[x2], df[y2])] 65 | gdf = GeoDataFrame(df, geometry=geometry) 66 | gdf.plot(ax=world_ax, marker='o', color=clr, markersize=5, zorder=3) 67 | 68 | geometry = [Point(xy) for xy in zip(df[x1], df[y1])] 69 | gdf = GeoDataFrame(df, geometry=geometry) 70 | gdf.plot(ax=world_ax, marker='o', color="green", markersize=10, zorder=2) 71 | 
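# Usage sketch (hypothetical data, not executed by main() below): both plotting helpers above work
# on plain columns rather than geometries. coords_on_map() expects "long"/"lat"; lines_on_map()
# additionally expects the geocoded "long_gc"/"lat_gc" columns that geocoding() adds later in this
# script.
#
# demo = pd.DataFrame({"long": [14.42, 2.35], "lat": [50.09, 48.86],
#                      "long_gc": [14.50, 2.29], "lat_gc": [50.00, 48.90]})
# coords_on_map(demo)   # true points only
# lines_on_map(demo)    # true vs. geocoded points joined by black lines
# plt.show()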
72 | 73 | def spacy_ner(df): 74 | if not path.exists("model"): 75 | print("=== Local folder of NLP model not found. Downloading") 76 | spacy.cli.download(nlp_model) 77 | nlp = spacy.load(nlp_model) # load the freshly downloaded model, then cache it locally 78 | nlp.to_disk('model') 79 | else: 80 | nlp = spacy.load('model') 81 | 82 | df['gpe'] = None 83 | df['loc'] = None 84 | df['match_geo'] = False 85 | 86 | print("=== NER Processing text of tweets to get GPE and LOC") 87 | 88 | for ind in df.index: 89 | doc = nlp(df['text'][ind]) 90 | for ent in doc.ents: 91 | if (ent.label_ == 'GPE'): 92 | gpe = ent.text 93 | df['gpe'][ind] = gpe 94 | if gpe: 95 | df['match_geo'][ind] = str(df['place'][ind]).find(gpe) != -1 96 | 97 | if (ent.label_ == 'LOC'): 98 | loc = ent.text 99 | df['loc'][ind] = loc 100 | if loc: 101 | df['match_geo'][ind] = str(df['place'][ind]).find(loc) != -1 102 | 103 | print("===", str(round(sum(df["match_geo"])/len(df["id"])*100, 2)), 104 | "% of ALL processed data match decoded Geo/Loc") 105 | 106 | return df 107 | 108 | 109 | def geocoding(df, city_country=True, coords=False): 110 | geolocator = Nominatim(user_agent="geoapiExercises") 111 | 112 | print("OpenStreetMap Geolocator is set up") 113 | 114 | def city_country_row(row): # row-wise reverse geocoding (lat/long -> city, country); distinct from the city_country flag 115 | try: 116 | coord = f"{row['lat']}, {row['long']}" 117 | location = geolocator.reverse(coord, exactly_one=True, timeout=None) 118 | address = location.raw['address'] 119 | city = address.get('city', '') 120 | country = address.get('country', '') 121 | row['city'] = city 122 | row['country'] = country 123 | except GeocoderTimedOut as e: 124 | print("Error: geocode failed on input %s with coordinates %s"%(f"{row['lat']}, {row['long']}", e)) 125 | 126 | if row['city'] is None or row['country'] is None: 127 | print(row["id"], "not found for Coords:", row['long'], row["lat"], 128 | "==> City:", row['city'], "Country:", row["country"]) 129 | 130 | return row 131 | 132 | def coords_row(row): # row-wise forward geocoding (GPE -> long/lat); distinct from the coords flag 133 | try: 134 | location = geolocator.geocode(row['gpe'], timeout=None) 135 | if location: 136 | row['lat_gc'] = location.latitude 137 | row['long_gc'] = location.longitude 138 | except GeocoderTimedOut as e: 139 | print("Error: geocode failed on input %s with GPE %s"%(row['gpe'], e)) 140 | 141 | if row['long_gc'] is None: 142 | print(row["id"], "not found for GPE:", row["gpe"], 143 | "==> Long:", row['long_gc'], "Lat:", row["lat_gc"]) 144 | 145 | return row 146 | 147 | if coords: 148 | print("=== Geocoding GPE to long, lat in progress") 149 | 150 | df["long_gc"] = None 151 | df["lat_gc"] = None 152 | 153 | df = df.apply(coords_row, axis=1) 154 | 155 | print("===", str(round(sum(df["long_gc"].notna())/len(df["id"])*100, 2)), 156 | "% of processed data attributed coordinates") 157 | 158 | if city_country: 159 | print("=== Reverse geocoding long, lat to city, country in progress") 160 | 161 | df["city"] = None 162 | df["country"] = None 163 | 164 | df = df.apply(city_country_row, axis=1) 165 | 166 | print("===", str(round(sum(df["city"].notna())/len(df["id"])*100, 2)), 167 | "% of processed data attributed city") 168 | print("===", str(round(sum(df["country"].notna())/len(df["id"])*100, 2)), 169 | "% of processed data attributed country") 170 | 171 | return df 172 | 173 | 174 | def save_df(file, df): 175 | with open(file, "w") as f: 176 | df.to_json(f, orient='records', lines=True) 177 | print("=== Data saved to file", file) 178 | 179 | 180 | def read_args(train=False, empty=False): 181 | try: # 1 arg - data input 182 | file = sys.argv[1] 183 | except IndexError: 184 | file = train_set 185 | 186 | if train: 187 | train_data =

read_json(file) 188 | #coords_on_map(train_data) 189 | else: 190 | train_data = None 191 | 192 | try: # 2 arg - data imput withut geo 193 | file = sys.argv[2] 194 | 195 | except IndexError: 196 | file = empty 197 | 198 | if empty: 199 | empty_data = read_json(file) 200 | else: 201 | empty_data = None 202 | 203 | return train_data, empty_data 204 | 205 | 206 | def main(): 207 | train_data, empty_data = read_args(True, False) # load train and empty set 208 | 209 | # NER layer 210 | geo = spacy_ner(train_data) 211 | geo = geo[geo['gpe'].notna()] 212 | #print(geo.head(5)) 213 | #print(geo.info()) 214 | print("===", str(round(sum(geo["match_geo"])/len(geo["id"])*100, 2)), 215 | "% of SUCCESSFULLY processed data match decoded Geo/Loc") 216 | save_df(temp, geo) 217 | 218 | # Geocoding layer 219 | geo = read_json(temp) 220 | df = geocoding(geo, True, True) 221 | save_df(temp, df) 222 | 223 | # Coordinates processing 224 | df = read_json(temp) 225 | df = df[df['long_gc'].notna()] 226 | 227 | def distance(row): 228 | dist = geopy.distance.geodesic( 229 | (row['lat'],row['long']),(row['lat_gc'],row['long_gc'])) 230 | row["dist"] = dist.meters 231 | return row 232 | 233 | df = df.apply(distance, axis=1) 234 | print(df.info()) 235 | save_df(result, df) 236 | 237 | x=df.loc[df['match_geo']==1, 'dist'] 238 | y=df.loc[df['match_geo']==0, 'dist'] 239 | 240 | bins=list(range(100)) 241 | 242 | plt.figure(figsize=(18,8)) 243 | plt.title("Distance of coords mismatch by match in 'place' column") 244 | plt.hist(x, bins, alpha=0.5, label='true') 245 | plt.hist(y, bins, alpha=0.5, label='false') 246 | plt.legend(loc='upper right') 247 | plt.show() 248 | 249 | lines_on_map(df) 250 | 251 | 252 | if __name__ == "__main__": 253 | main() 254 | -------------------------------------------------------------------------------- /supplementary_resources/scripts/python/old_project_examples/extract_tweets_with_smileys.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import json 5 | import pyarrow as pa 6 | import pyarrow.parquet as pq 7 | import re 8 | import sys 9 | 10 | POSITIVE_SMILEYS = "😹😅😺😽😂🐱💌🌚😗💏😌💚❤😆👍💑💞😘🎉😛😚💓👌💖💛😙😁🐶😄👏😍💙🏆😊😀♡😻💘😝💗😃😜😇✌😎😋💕😉😸♥👻🌝🙂🤓🤗🤠🤣🤩🤪🥰🥳" # 💯 11 | NEGATIVE_SMILEYS = "🤨😒🙄😬🤥🤢🤮🥵🥶😵🤯😕😟🙁😮😯😲😳😦😧😨😰😥😢😱😖😣😞😓😩😫🥱😤😡😠🤬😈👿🖕👎😔🙀😾💀😭😪💔😿🙍👺🙎" # removed for ambiguity: 😭🤔🤐✊👊🥺😷💩🆘🥴💧 12 | 13 | def parse_tweet(obj): 14 | try: 15 | if 'extended_tweet' in obj: 16 | txt = obj['extended_tweet']['full_text'] 17 | else: 18 | txt = obj['text'] 19 | except KeyError: 20 | txt = None 21 | 22 | idstr = obj['id_str'] 23 | lang = obj['lang'] 24 | return idstr,txt,lang 25 | 26 | try: 27 | filename = sys.argv[1] 28 | except IndexError: 29 | filename = '/nfs/scistore14/chlgrp/chl/twitter-stream/data/corona/2020/02/corona-2020-02-25.txt' 30 | 31 | positive_smileys_re = re.compile(u'['+POSITIVE_SMILEYS+']') 32 | negative_smileys_re = re.compile(u'['+NEGATIVE_SMILEYS+']') 33 | 34 | positive_translation_table = str.maketrans("\n\r", " ", POSITIVE_SMILEYS) 35 | negative_translation_table = str.maketrans("\n\r", " ", NEGATIVE_SMILEYS) 36 | 37 | def filter(fid, min_length=10, skip_retweets=True): 38 | known_ids = set() 39 | for line in fid: 40 | if not line: 41 | continue 42 | try: 43 | obj = json.loads(line) 44 | except (json.decoder.JSONDecodeError, TypeError): 45 | #print("ERROR: entry wasn't a dictionary. 
skipping.", file=sys.stderr) 46 | continue 47 | 48 | try: 49 | if 'id_str' not in obj: 50 | print("ERROR: 'id' field not found in tweet", file=sys.stderr) 51 | continue 52 | if 'created_at' not in obj: 53 | print("ERROR: 'created_at' field not found in tweet {}".format(tweet['id']), file = sys.stderr) 54 | continue 55 | if 'retweeted_status' in obj and skip_retweets: # skip retweets 56 | continue 57 | except TypeError: 58 | print("ERROR: not a dict?", line, obj, file=sys.stderr) 59 | continue 60 | 61 | idstr, txt, lang = parse_tweet(obj) 62 | if not txt: # no text 63 | continue 64 | if idstr in known_ids: # duplicate 65 | continue 66 | known_ids.add(idstr) 67 | 68 | pos = re.findall(positive_smileys_re, txt) 69 | neg = re.findall(negative_smileys_re, txt) 70 | if not pos and not neg: 71 | continue # drop, because no smiley 72 | if pos and neg: 73 | continue # drop, because confusing 74 | if pos and not neg: 75 | txt = txt.translate(positive_translation_table) 76 | label = 1 77 | elif neg and not pos: 78 | txt = txt.translate(negative_translation_table) 79 | label = 0 80 | 81 | if len(txt) token (equiv. to [CLS]) 39 | x = self.out_proj(x) 40 | return x 41 | 42 | def main(): 43 | parser = argparse.ArgumentParser(description='Finetune multilingual transformer model') 44 | parser.add_argument('-o','--output_file', dest='output_file', type=str, default=None, help='Output file for predictions.') 45 | parser.add_argument('-n','--nepochs', type=int, default=1, help='Number of epochs to train') 46 | parser.add_argument('-m','--modelname', type=str, default='models/model.bin', help='Output filename for model (leave empty for not saving)') 47 | parser.add_argument('-t','--tokenizername', type=str, default='models/tokenizer.bin', help='Output filename for tokenizer (leave empty for not saving)') 48 | parser.add_argument('-T','--nthreads', type=int, default=None, help='Number of threads for tokenizing (default: OMP_NUM_THREADS)') 49 | parser.add_argument('-s','--seed', type=int, default=0, help='Random seed (default: 0)') 50 | parser.add_argument('-b','--batchsize', type=int, default=24, help='Per-device batchsize (default: 24)') 51 | parser.add_argument('-l','--linear', action="store_true", help="Train only linear model") 52 | parser.add_argument('--fp16', action="store_true", help="Use fp16 mixed precision") 53 | parser.add_argument('--fp32', dest='fp16', action="store_false", help="Use fp32 precision") 54 | parser.add_argument('-L','--lrscheduler', type=str, default="cosine", choices=['linear', 'cosine', 'cosine_with_restarts','polynomial','constant','constant_with_warmup'], help="Learning Rate Scheduler") 55 | parser.add_argument('files', nargs="+", type=str, metavar='INPUTFILES', help="Input files (in parquet format)") 56 | args = parser.parse_args() 57 | 58 | if not args.files: 59 | print("Usage: {} INPUTFILES".format(sys.argv[0])) 60 | raise SystemExit 61 | 62 | print(f"Loading {len(args.files)} data files.") 63 | df = load_data(args.files) 64 | 65 | tokenizer = XLMRobertaTokenizer.from_pretrained('sentence-transformers/stsb-xlm-r-multilingual') 66 | if args.tokenizername: 67 | print(f"Writing tokenizer {args.tokenizername}") 68 | tokenizer.save_pretrained(args.tokenizername) 69 | 70 | num_workers = get_num_workers(args.nthreads) 71 | print(f"Using pool with {num_workers} workers for tokenization.") 72 | tokenized_results = tokenize_dataset(df, tokenizer=tokenizer, num_workers=num_workers) 73 | 74 | # insert results 75 | df['input_ids'] = sum([part['input_ids'] for part in tokenized_results],[]) 
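# sum(list_of_lists, []) concatenates the per-chunk tokenizer outputs back into one flat list;
# np.array_split and pool.map both preserve order, so the rows stay aligned with df.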
76 | df['attention_mask'] = sum([part['attention_mask'] for part in tokenized_results],[]) 77 | 78 | val_size = min(len(df)//10, 10000) 79 | df_train, df_val = train_test_split(df, test_size=val_size, random_state=args.seed) 80 | del df 81 | data_train = TwitterDataset(df_train) 82 | del df_train 83 | data_val = TwitterDataset(df_val) 84 | del df_val 85 | 86 | if args.linear: 87 | # create model with frozen sentence embedding part 88 | with torch.no_grad(): 89 | model = AutoModelForSequenceClassification.from_pretrained('sentence-transformers/stsb-xlm-r-multilingual') 90 | for param in model.parameters(): 91 | param.requires_grad = False 92 | model.classifier = LinearHead(hidden_size=768, num_labels=2) 93 | else: 94 | model = AutoModelForSequenceClassification.from_pretrained('sentence-transformers/stsb-xlm-r-multilingual') 95 | 96 | num_gpus = max(1, torch.cuda.device_count()) 97 | print(f"Using {num_gpus} devices to train.") 98 | training_args = TrainingArguments( 99 | output_dir='./results/', 100 | report_to = "all", 101 | adafactor=True, # not using AdamW, let's see 102 | learning_rate=5e-5, # default is 5e-5 103 | num_train_epochs=args.nepochs, 104 | per_device_train_batch_size=args.batchsize, 105 | per_device_eval_batch_size=args.batchsize, 106 | #warmup_steps=500, # number of warmup steps for learning rate scheduler 107 | warmup_steps=0.1, # number of warmup steps for learning rate scheduler 108 | #lr_scheduler_type=args.lrscheduler, 109 | #weight_decay=0.0, # strength of weight decay 110 | logging_dir='./logs/', # directory for storing logs 111 | logging_steps=100, # log often 112 | evaluation_strategy="steps", 113 | eval_steps=500, # evaluate often 114 | save_strategy="epoch", # save rarely 115 | fp16 = args.fp16, 116 | #fp16_full_eval=True 117 | ) 118 | num_steps_per_epoch = len(data_train)//(args.batchsize*num_gpus) 119 | num_updates = num_steps_per_epoch * args.nepochs 120 | if num_updates > 5000: 121 | training_args.eval_steps = 1000 # evaluate less often for big datasets 122 | if num_updates < 100: 123 | training_args.logging_steps = 1 124 | training_args.eval_steps = num_updates//5 125 | elif num_updates < 1000: 126 | training_args.logging_steps = 10 127 | training_args.eval_steps = num_updates//10 128 | 129 | optimizer = Adafactor(model.parameters(), scale_parameter=False, relative_step=False, warmup_init=False, lr=5e-5) 130 | lr_scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optimizer=optimizer, 131 | num_warmup_steps=0, 132 | num_training_steps=num_updates, 133 | num_cycles=args.nepochs) 134 | 135 | trainer = Trainer( 136 | model=model, 137 | args=training_args, 138 | train_dataset=data_train, 139 | eval_dataset=data_val, 140 | compute_metrics=compute_metrics, 141 | optimizers=(optimizer, lr_scheduler) 142 | ) 143 | trainer.train() 144 | trainer.evaluate() 145 | 146 | del data_train, data_val 147 | 148 | if args.modelname: 149 | print(f"Writing final model {args.modelname}") 150 | trainer.save_model(args.modelname) 151 | 152 | if __name__ == "__main__": 153 | main() 154 | -------------------------------------------------------------------------------- /supplementary_resources/scripts/python/old_project_examples/tweet_utils.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | import multiprocessing 3 | import numpy as np 4 | import os 5 | import pandas as pd 6 | import pyarrow.parquet as pq 7 | from sklearn.metrics import accuracy_score, roc_auc_score 8 | import torch 9 | 10 | def 
load_parquet(filename): 11 | df = pq.read_table(filename).to_pandas() 12 | df.set_index('id') 13 | return df 14 | 15 | def load_data(filenames): 16 | df = pd.concat(load_parquet(f) for f in filenames) 17 | return df 18 | 19 | class TwitterDataset(torch.utils.data.Dataset): 20 | def __init__(self, df): 21 | self.input_ids = df.input_ids.to_numpy() 22 | self.attention_mask = df.attention_mask.to_numpy() 23 | self.labels = df.label.to_numpy() 24 | 25 | def __getitem__(self, idx): 26 | item = {} 27 | item['input_ids'] = torch.tensor(self.input_ids[idx]) 28 | item['attention_mask'] = torch.tensor(self.attention_mask[idx]) 29 | item['labels'] = torch.tensor(self.labels[idx]) 30 | return item 31 | 32 | def __len__(self): 33 | return len(self.labels) 34 | 35 | def compute_metrics(pred): 36 | labels = pred.label_ids 37 | scores = pred.predictions[:,1]-pred.predictions[:,0] # logits, so not normalized 38 | preds = pred.predictions.argmax(axis=-1) 39 | acc = accuracy_score(labels, preds) 40 | auc = roc_auc_score(labels, scores) 41 | return { 'accuracy': acc, 'auc': auc } 42 | 43 | def tokenize(tokenizer, df): 44 | return tokenizer.batch_encode_plus(df.text, padding="max_length", max_length=160, truncation=True) 45 | 46 | def tokenize_dataset(df, tokenizer, num_workers): 47 | print(f"Will use pool with {num_workers} workers for tokenization.") 48 | with multiprocessing.Pool(num_workers) as pool: 49 | chunks = np.array_split(df, num_workers) 50 | tokenized_results = pool.map(partial(tokenize, tokenizer), chunks) 51 | return tokenized_results 52 | 53 | def get_num_workers(max_nthreads): 54 | num_workers = max(1, multiprocessing.cpu_count()-1) 55 | if max_nthreads: 56 | num_workers = min(num_workers, max_nthreads) 57 | else: 58 | num_workers = min(num_workers, int(os.environ.get("OMP_NUM_THREADS", 1000000))) 59 | return num_workers 60 | -------------------------------------------------------------------------------- /supplementary_resources/scripts/python/transformers-tutorial-test.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | import numpy as np 3 | from datasets import load_metric 4 | from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer 5 | from torch import nn 6 | import torch 7 | 8 | # test tutorial transformers run 9 | 10 | dataset = load_dataset("yelp_review_full") 11 | 12 | print(dataset["train"][100]) 13 | dataset_df = dataset["train"].to_pandas() 14 | print(dataset_df.head()) 15 | 16 | features = dataset["train"].features 17 | print(features) 18 | 19 | print(dataset_df["label"].value_counts(normalize=True).sort_index()) 20 | dataset = dataset.rename_column("label", "labels") 21 | 22 | tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") 23 | 24 | 25 | def tokenize_function(examples): 26 | return tokenizer(examples["text"], padding="max_length", truncation=True) 27 | 28 | 29 | tokenized_datasets = dataset.map(tokenize_function, batched=True) 30 | print(tokenized_datasets) 31 | 32 | small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) 33 | small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) 34 | 35 | model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5) 36 | metric = load_metric("accuracy") 37 | 38 | 39 | def compute_metrics(eval_pred): 40 | logits, labels = eval_pred 41 | predictions = np.argmax(logits, axis=-1) 42 | return 
metric.compute(predictions=predictions, references=labels) 43 | 44 | 45 | class_weights = (1 - (dataset_df["label"].value_counts().sort_index() / len(dataset_df))).values 46 | print(class_weights) 47 | class_weights = torch.from_numpy(class_weights).float() 48 | print(class_weights) 49 | 50 | 51 | class WeightedLossTrainer(Trainer): 52 | def compute_loss(selfself, model, inputs, return_outputs=False): 53 | outputs = model(**inputs) 54 | logits = outputs.get("logits") 55 | labels = inputs.get("labels") 56 | loss_func = nn.CrossEntropyLoss(weight=class_weights) 57 | loss = loss_func(logits, labels) 58 | return (loss, outputs) if return_outputs else loss 59 | 60 | 61 | batch_size = 8 62 | logging_steps = len(dataset["train"]) // batch_size 63 | output_dir = "test_trainer" 64 | training_args = TrainingArguments(output_dir=output_dir, 65 | num_train_epochs=3, 66 | learning_rate=2e-5, 67 | per_device_train_batch_size=batch_size, 68 | per_device_eval_batch_size=batch_size, 69 | weight_decay=0.01, 70 | logging_steps=logging_steps, 71 | evaluation_strategy="epoch") 72 | 73 | trainer = WeightedLossTrainer( 74 | model=model, 75 | args=training_args, 76 | train_dataset=small_train_dataset, 77 | eval_dataset=small_eval_dataset, 78 | compute_metrics=compute_metrics, 79 | ) 80 | 81 | trainer.train() 82 | 83 | trainer.evaluate() 84 | 85 | trainer.save_model('save_test/model') 86 | # alternative saving method and folder 87 | model.save_pretrained('saving_test') 88 | -------------------------------------------------------------------------------- /train_bert.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from transformers import BertTokenizer 3 | from utils.model_trainer import * 4 | 5 | # Entry point for training and evaluation of the models 6 | 7 | f = ["GEO", "NON-GEO", "META-DATA", "TEXT-ONLY", "GEO-ONLY", "USER-ONLY"] 8 | 9 | dataset_file = "test-3877395-filtered-16219676-us-twitter-2021.jsonl" # .jsonl 10 | features = [f[1], f[4]] 11 | val_f = f[1] # None -> features[0] 12 | target_columns = ["lon", "lat"] 13 | 14 | original_worldwide_model = "bert-base-multilingual-cased" 15 | original_usa_model = "bert-base-cased" 16 | 17 | # parameters = dict( 18 | # max_lr = [5e-5, 1e-5], 19 | # min_lr = [5e-6, 1e-6, 1e-8, 1e-16], 20 | # scheduler = ["cosine", "plateau"] 21 | # ) 22 | # param_values = [v for v in parameters.values()] 23 | 24 | covariance_types = [None, "spher"] # [None, "full", "spher", "diag", "tied"] 25 | scheduler_types = ["cosine", "linear", "plateau"] # ["cosine", "linear", "cosine-long", "plateau", "step", "multi step", "one cycle", "cyclic"] 26 | 27 | loss_distance = True 28 | loss_mf = "mean" # mean/sum - mean if features > 1 29 | loss_prob = "pos" # all/pos - pos if prob 30 | loss_total = "mean" # sum/mean/type - mean if prob else type (spat) 31 | 32 | outcomes = 5 33 | covariance = covariance_types[1] # None/spher 34 | 35 | epochs = 3 36 | log_step = 1000 37 | 38 | batch_size = 4 39 | 40 | lr_max = 1e-5 41 | lr_min = 1e-6 42 | scheduler = scheduler_types[0] 43 | 44 | val_size = 1000 # samples/users if -vu 45 | threshold = 100 46 | 47 | train_size = 0 48 | test_ratio = 0.1 49 | seed = 42 50 | 51 | ref_file = None # "us-twitter-2020.jsonl" # if not None exclude found users from the current data 52 | bot_filter = False 53 | 54 | 55 | def main(): 56 | parser = argparse.ArgumentParser(description='Finetune multilingual transformer model') 57 | parser.add_argument('-n', '--nepochs', type=int, default=epochs, help='Number of epochs 
to train') 58 | parser.add_argument('-ss', '--skip', type=int, default=0, help='Number of dataset samples to skip') 59 | 60 | parser.add_argument('-sc', '--scale_coord', action="store_true", help="Scale coordinates by 0.01 (default: False, coordinates are kept unscaled)") 61 | parser.add_argument('-o', '--outcomes', type=int, default=outcomes, help="Number of outcomes (lon, lat) per tweet") 62 | parser.add_argument('-c', '--covariance', type=str, default=covariance, help="Covariance matrix type") 63 | parser.add_argument('-nw', '--weighted', action="store_false", help="Use equal GMM weights instead of learned ones (default: weighted)") 64 | 65 | parser.add_argument('-ld', '--loss_dist', action="store_false", help="Disable the distance loss criterion and use per-coordinate loss instead (default: distance loss enabled)") 66 | parser.add_argument('-lmf', '--loss_mf', type=str, default=loss_mf, help="Multi feature loss handle mean or sum (default: mean)") 67 | parser.add_argument('-lp', '--loss_prob', type=str, default=loss_prob, help="Probabilistic loss domain all or pos (default: pos)") 68 | parser.add_argument('-lt', '--loss_total', type=str, default=loss_total, help="Total loss handle by model type, sum or mean (default: mean)") 69 | 70 | parser.add_argument('-m', '--local_model', type=str, default=None, help='Filename prefix of local model') 71 | parser.add_argument('--nockp', action="store_false", help='Disable saving model checkpoints during training (default: checkpoints are saved)') 72 | 73 | parser.add_argument('-lr', '--learn_rate', type=float, default=lr_max, help='Learning rate (default: 1e-5)') 74 | parser.add_argument('-lrm', '--learn_rate_min', type=float, default=lr_min, help='Learning rate minimum (default: 1e-6)') 75 | parser.add_argument('-sdl', '--scheduler', type=str, default=scheduler, help="Scheduler type") 76 | 77 | parser.add_argument('-b', '--batch_size', type=int, default=batch_size, help='Per-device batch size (default: 4)') 78 | parser.add_argument('-ls', '--log_step', type=int, default=log_step, help='Log step (default: 1000)') 79 | 80 | parser.add_argument('-us', '--usa_model', action="store_true", help="Use USA model instead of worldwide (default: False)") 81 | parser.add_argument('-d', '--dataset', type=str, default=dataset_file, help="Input dataset (in jsonl format)") 82 | parser.add_argument('-f', '--features', default=features, nargs='+', help="Feature names") 83 | parser.add_argument('-ts', '--train_size', type=int, default=train_size, help='Training dataloader size') 84 | parser.add_argument('-tr', '--test_ratio', type=float, default=test_ratio, help='Training dataloader test ratio (default: 0.1)') 85 | parser.add_argument('-s', '--seed', type=int, default=seed, help='Random seed (default: 42)') 86 | parser.add_argument('-v', '--val_size', type=int, default=val_size, help='Validation dataloader size') 87 | parser.add_argument('-th', '--threshold', type=int, default=threshold, help='Validation threshold in km (default: 100)') 88 | parser.add_argument('-vu', '--val_user', action="store_true", help="Form validation dataset by user (default: False)") 89 | 90 | parser.add_argument('--train', action="store_true", help="Start finetuning") 91 | parser.add_argument('--eval', action="store_true", help="Start evaluation") 92 | parser.add_argument('--hptune', action="store_true", help="Start training with hyperparameter tuning") 93 | args = parser.parse_args() 94 | 95 | if args.local_model is None: 96 | prefix = f"{'US-' if args.usa_model else ''}{'U-' if not args.scale_coord else ''}{'+'.join(args.features)}-O{args.outcomes}-{'d' if args.loss_dist else 'c'}-" \ 97 | f"total_{args.loss_total if 
args.covariance is not None else 'type'}-{'mf_' + args.loss_mf + '-' if len(args.features) > 1 else ''}" \ 98 | f"{args.loss_prob + '_' if args.covariance is not None else ''}{args.covariance if args.covariance is not None else 'NP'}-" \ 99 | f"{'weighted-' if args.weighted and args.outcomes > 1 else ''}N{args.train_size//100000}e5-" \ 100 | f"B{args.batch_size}-E{args.nepochs}-{args.scheduler}-LR[{args.learn_rate};{args.learn_rate_min}]" 101 | else: 102 | prefix = args.local_model 103 | 104 | print(f"Model prefix:\t{prefix}") 105 | if torch.cuda.is_available(): 106 | print(f"DEVICE\tAvailable GPU has {torch.cuda.device_count()} devices, using {torch.cuda.get_device_name(0)}") 107 | print(f"DEVICE\tCPU has {torch.get_num_threads()} threads") 108 | else: 109 | print(f"DEVICE\tNo GPU available, using the CPU with {torch.get_num_threads()} threads instead.") 110 | 111 | original_model = original_usa_model if args.usa_model else original_worldwide_model 112 | 113 | # combine_datasets(["test-10501727-filtered-17174594-worldwide-twitter-2020_0.jsonl", "test-10586286-filtered-17264575-worldwide-twitter-2020_1.jsonl", "test-3783510-filtered-6464689-worldwide-twitter-2020_2.jsonl"], "test-filtered-worldwide-twitter-2020.jsonl") 114 | 115 | dataloader = TwitterDataloader(args.dataset, 116 | args.features, 117 | target_columns, 118 | BertTokenizer.from_pretrained(original_model), 119 | args.seed, 120 | args.scale_coord, 121 | val_f, 122 | bot_filter) 123 | 124 | # no settings run to save filtered by condition dataset copy 125 | # dataloader.filter_dataset("code", "US", None) 126 | 127 | trainer = ModelTrainer(prefix, 128 | dataloader, 129 | args.nepochs, 130 | args.batch_size, 131 | args.outcomes, 132 | args.covariance, 133 | args.weighted, 134 | args.loss_dist, 135 | args.loss_mf, 136 | args.loss_prob, 137 | args.loss_total, 138 | args.learn_rate, 139 | args.learn_rate_min, 140 | original_model) 141 | 142 | # if args.hptune: 143 | # trainer.hp_tuning(args.train_size, 144 | # args.test_ratio, 145 | # param_values, 146 | # args.log_step) 147 | 148 | if args.train: 149 | trainer.finetune(args.train_size, 150 | args.test_ratio, 151 | f"{prefix}.pth", 152 | args.nockp, 153 | args.log_step, 154 | args.scheduler, 155 | args.skip) 156 | 157 | if args.eval: 158 | trainer.eval(args.val_size, 159 | args.threshold, 160 | args.val_size, 161 | args.val_user, 162 | args.train_size, 163 | ref_file) 164 | 165 | 166 | if __name__ == "__main__": 167 | main() 168 | -------------------------------------------------------------------------------- /utils/benchmarks.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.distributions as dist 4 | import numpy as np 5 | from utils.regressor import * 6 | 7 | # model benchmarks for loss and metrics 8 | 9 | 10 | # R2 score 11 | def r2_score(X, Y): 12 | labels_mean = torch.mean(Y) 13 | ss_tot = torch.sum((Y - labels_mean) ** 2) 14 | ss_res = torch.sum((Y - X) ** 2) 15 | r2 = 1 - ss_res / ss_tot 16 | return r2 17 | 18 | 19 | # spher cov from raw outputs 20 | def spher_sigma(X, outcomes, outputs_map, lower_limit=0): 21 | softplus = nn.Softplus() 22 | S = softplus(X[:, outputs_map["sigma"][0]:outputs_map["sigma"][1]]) + lower_limit 23 | return S.reshape([X.size(dim=0), outcomes]) 24 | 25 | 26 | # GM/GMM from raw outputs 27 | def GaussianModel(X, outcomes, outputs_map, prob_domain, cov): 28 | softplus = nn.Softplus() 29 | batch = X.size(dim=0) 30 | 31 | means = X[:, 
outputs_map["coord"][0]:outputs_map["coord"][1]] 32 | if outcomes > 1: 33 | means = means.reshape([batch, outcomes, 2]) 34 | 35 | sigma_lower_limit = 1 / (2 * math.pi) if prob_domain == "pos" else 0 36 | positive_sigma = spher_sigma(X, outcomes, outputs_map, sigma_lower_limit) 37 | 38 | sigma = None 39 | tril = None 40 | 41 | if cov == "spher": 42 | sigma = torch.eye(2, device=X.device) * positive_sigma.reshape(-1, 1)[:, None] 43 | elif cov == "diag": 44 | sigma = torch.eye(2, device=X.device) * positive_sigma.reshape(-1, 2)[:, None] 45 | else: 46 | tril_indices = torch.tril_indices(row=2, col=2, offset=0, device=X.device) 47 | if cov == "tied" or outcomes == 1: 48 | tril = torch.zeros((2, 2), device=X.device).repeat(batch, 1).reshape([batch, 2, 2]) 49 | tril[:, tril_indices[0], tril_indices[1]] = positive_sigma.reshape([batch, 3]) 50 | else: 51 | tril = torch.zeros((2, 2), device=X.device).repeat(batch, outcomes).reshape([batch, outcomes, 2, 2]) 52 | tril[:, :, tril_indices[0], tril_indices[1]] = positive_sigma.reshape([batch, outcomes, 3]) 53 | 54 | if sigma is not None: 55 | if outcomes > 1: 56 | sigma = sigma.reshape([batch, outcomes, 2, 2]) 57 | gaussian = dist.MultivariateNormal(means, sigma) 58 | else: 59 | if outcomes > 1 and cov == "tied": 60 | tril = tril.reshape(batch, -1).repeat(1, outcomes).reshape([batch, outcomes, 2, 2]) 61 | gaussian = dist.MultivariateNormal(means, scale_tril=tril) 62 | 63 | return gaussian 64 | 65 | 66 | # GMM weights from raw outputs 67 | def GaussianWeights(X, outcomes, outputs_map): 68 | softmax = nn.Softmax(dim=1) 69 | weights = X[:, outputs_map["weight"][0]:outputs_map["weight"][1]].reshape([X.size(dim=0), outcomes]) if outputs_map["weight"] else torch.ones((X.size(dim=0), outcomes), device=X.device) 70 | gmm_weights = dist.Categorical(softmax(weights)) 71 | return gmm_weights 72 | 73 | 74 | # spatial weights from raw outputs 75 | def weights(X, outcomes, outputs_map): 76 | softmax = nn.Softmax(dim=1) 77 | W = X[:, outputs_map["weight"][0]:outputs_map["weight"][1]].reshape([X.size(dim=0), outcomes]) if outputs_map["weight"] else torch.ones((X.size(dim=0), outcomes), device=X.device) 78 | return softmax(W) 79 | 80 | 81 | # Distance^2 to genuine truth from raw outputs 82 | def dist2(X, y, outcomes, outputs_map): 83 | def coord_se(X, y, outcomes): 84 | error_by_coord = nn.MSELoss(reduction='none') 85 | Y = y.repeat(1, outcomes) 86 | E = error_by_coord(X, Y) 87 | return E 88 | 89 | E = coord_se(X[:, outputs_map["coord"][0]:outputs_map["coord"][1]], y, outcomes) 90 | D = torch.zeros((X.size(dim=0), 0), device=X.device) 91 | for i in range(outputs_map["coord"][0], outputs_map["coord"][1], 2): 92 | D = torch.cat((D, torch.sum(E[:, i:i+2], dim=1, keepdim=True)), 1) 93 | return D 94 | 95 | 96 | # Negative Log Likelihood fit for genuine truth from raw outputs 97 | def lh_loss(X, y, outcomes, outputs_map, prob_domain): 98 | gaussian = GaussianModel(X, outcomes, outputs_map, prob_domain, "spher") 99 | if outcomes > 1: 100 | gmm_weights = GaussianWeights(X, outcomes, outputs_map) 101 | gmm = dist.MixtureSameFamily(gmm_weights, gaussian) 102 | L = -gmm.log_prob(y) 103 | else: 104 | L = -gaussian.log_prob(y) 105 | return L 106 | 107 | 108 | # weighted D2 loss from raw outputs 109 | def d_loss(X, y, outcomes, outputs_map): 110 | D = dist2(X, y, outcomes, outputs_map) 111 | if outcomes > 1: 112 | W = weights(X, outcomes, outputs_map) 113 | L = torch.sum(D * W, dim=1) 114 | else: 115 | L = D 116 | return L 117 | 118 | 119 | class ModelBenchmark(): 120 | def 
__init__(self, model, distance=True, loss_prob="pos", mf_loss="mean", total_loss="type"): 121 | self.model = model 122 | self.dist = distance 123 | self.prob_domain = loss_prob 124 | self.mf_handle = mf_loss 125 | self.total_loss_crit = total_loss 126 | 127 | self.outcomes = self.model.n_outcomes 128 | self.cov = self.model.cov 129 | self.weighted = self.model.weighted 130 | self.features = self.model.features 131 | 132 | self.outputs_map = { 133 | "coord": [0, self.model.coord_output], 134 | "weight": [self.model.coord_output, self.model.coord_output + self.model.weights_output] if self.weighted else None, 135 | "sigma": [self.model.coord_output + self.model.weights_output, self.model.coord_output + self.model.weights_output + self.model.cov_output] if self.cov else None 136 | } 137 | 138 | self.single_outputs_map = { 139 | "coord": [0, 2], 140 | "weight": None, 141 | "sigma": [2, 3] if self.cov else None 142 | } 143 | 144 | self.loss_type = 1 if self.outputs_map["sigma"] else 0 145 | 146 | print(f"TRAIN\tLOSS\tKey Feature - {'sum of spat and prob' if self.total_loss_crit == 'sum' else 'by model type'}:\n" 147 | f"\tGeospatial accuracy:\t{'weighted ' if self.weighted else ''}{'distance' if self.dist else 'coord'} error^2 for {self.outcomes} outcome(s)") 148 | 149 | if self.outputs_map["sigma"] is not None: 150 | print(f"\tProbability accuracy:\t {'limited' if self.prob_domain == 'pos' else 'unlimited'} -LLH for PDF of {'weighted ' if self.weighted else ''}" 151 | f"{'GM' if self.outcomes == 1 else 'GMM'} with {self.cov} covariance matrix ") 152 | 153 | if len(self.features) > 1: 154 | print(f"TRAIN\tLOSS\tMinor Features - {self.mf_handle} of:\n\tGeospatial accuracy:\tsingle {'distance' if self.dist else 'coord'} error^2") 155 | if self.outputs_map["sigma"]: 156 | print(f"\tProbability accuracy:\t {'limited' if self.prob_domain == 'pos' else 'unlimited'} -LLH for PDF of single GM with spher covariance matrix ") 157 | 158 | def minor_feature_loss(self, outputs, labels): 159 | X = outputs.squeeze().float() 160 | y = labels.squeeze().float() 161 | 162 | if X.dim() == 1: 163 | X = X.reshape(1, -1) 164 | 165 | spat_loss = d_loss(X, y, 1, self.single_outputs_map).mean() 166 | prob_loss = torch.zeros_like(spat_loss, device=X.device) 167 | if self.outputs_map["sigma"]: 168 | prob_loss = lh_loss(X, y, 1, self.single_outputs_map, self.prob_domain).mean() 169 | return spat_loss, prob_loss 170 | 171 | def key_feature_loss(self, outputs, labels): 172 | X = outputs.squeeze().float() 173 | y = labels.squeeze().float() 174 | 175 | if X.dim() == 1: 176 | X = X.reshape(1, -1) 177 | 178 | spat_loss = d_loss(X, y, self.outcomes, self.outputs_map).mean() 179 | prob_loss = torch.zeros_like(spat_loss, device=X.device) 180 | if self.outputs_map["sigma"]: 181 | prob_loss = lh_loss(X, y, self.outcomes, self.outputs_map, self.prob_domain).mean() 182 | return spat_loss, prob_loss 183 | 184 | def total_batch_loss(self, batch_loss): 185 | all_features_loss = torch.mean(batch_loss, dim=0) if self.mf_handle == "mean" else torch.sum(batch_loss, dim=0) 186 | if self.total_loss_crit == "sum": 187 | total_loss = torch.sum(all_features_loss, dim=0) 188 | elif self.total_loss_crit == "mean": 189 | total_loss = torch.mean(all_features_loss, dim=0) 190 | elif self.total_loss_crit == "type": 191 | total_loss = all_features_loss[1 if self.outputs_map["sigma"] else 0] 192 | return total_loss 193 | 194 | # pytorch GM/GMM from raw outputs 195 | def prob_models(self, outputs): 196 | X = outputs.squeeze().float() 197 | 198 | if X.dim() 
== 1: 199 | X = X.reshape(1, -1) 200 | 201 | gaussian = GaussianModel(X, self.outcomes, self.outputs_map, self.prob_domain, self.cov) 202 | if self.outcomes > 1: 203 | gmm_weights = GaussianWeights(X, self.outcomes, self.outputs_map) 204 | return dist.MixtureSameFamily(gmm_weights, gaussian) 205 | else: 206 | return gaussian 207 | 208 | # spat and prob loss from raw outputs 209 | def result_metrics(self, outputs, labels): 210 | X = outputs.squeeze().float() 211 | y = labels.squeeze().float() 212 | 213 | if X.dim() == 1: 214 | X = X.reshape(1, -1) 215 | 216 | spat_loss = d_loss(X, y, self.outcomes, self.outputs_map).reshape(-1, 1) 217 | prob_loss = torch.zeros_like(spat_loss, device=X.device) 218 | if self.outputs_map["sigma"]: 219 | prob_loss = lh_loss(X, y, 1, self.single_outputs_map, self.prob_domain).reshape(-1, 1) 220 | 221 | return spat_loss, prob_loss 222 | 223 | def r2(self, outputs, labels): 224 | if outputs.dim() == 1: 225 | outputs = outputs.reshape(1, -1) 226 | 227 | Y = labels.repeat(1, self.outcomes) if self.outcomes > 1 else labels 228 | X = outputs[:, self.outputs_map["coord"][0]:self.outputs_map["coord"][1]] 229 | r2 = r2_score(X, Y) 230 | return r2 231 | 232 | # tensorboard metrics logging 233 | def log(self, writer, step, lr, train_metric, cur_batch, val_metric=None): 234 | def total_loss_log(metric, metric_type="total"): 235 | if metric_type == "val": 236 | atf_loss = metric[:, 0] 237 | folder = f"mean_val" 238 | else: 239 | atf_loss = np.mean(metric[:, :, 0], axis=0) if self.mf_handle == "mean" else np.sum(metric[:, :, 0], axis=0) 240 | folder = f"current_step" if metric_type != "total" else f"mean_train" 241 | 242 | if self.total_loss_crit == "sum": 243 | total_loss = np.sum(atf_loss, axis=0) 244 | elif self.total_loss_crit == "mean": 245 | total_loss = np.mean(atf_loss, axis=0) 246 | elif self.total_loss_crit == "type": 247 | total_loss = atf_loss[1 if self.outputs_map["sigma"] else 0] 248 | 249 | log = f"\tTotal loss of all features:\t{total_loss}" 250 | print(log) 251 | writer.add_scalar(f"{folder}/total_loss", total_loss, step) 252 | 253 | if metric_type == "total": 254 | self.mean_epoch_train_loss = total_loss 255 | 256 | def spat_loss_log(metric, metric_type="total"): 257 | if metric_type == "val": 258 | spat_loss, r2 = metric[0, 0], metric[0, 1] 259 | folder, log = f"mean_val", f"\tGeospatial {spatial} loss:\t{spat_loss}\tCoord R2:\t{r2}" 260 | else: 261 | spat_loss, r2 = np.mean(metric[:, 0, 0], axis=0) if self.mf_handle == "mean" else np.sum(metric[:, 0, 0], axis=0), None 262 | folder, log = f"current_step", f"\tGeospatial {self.mf_handle} {spatial} loss:\t{spat_loss}" 263 | if metric_type == "total": 264 | r2 = metric[0, 0, 1] 265 | folder = f"mean_train" 266 | log += f"\tCoord R2:\t{r2}" 267 | 268 | if metric.shape[0] > 1: 269 | key_spat, minor_spat = metric[0, 0, 0], np.mean(metric[1:, 0, 0], axis=0) 270 | log += f"\n\t\tKey:\t{key_spat}\tMinor:\t{minor_spat}" 271 | writer.add_scalar(f"{folder}/spat_key", key_spat, step) 272 | writer.add_scalar(f"{folder}/spat_minor", minor_spat, step) 273 | 274 | print(log) 275 | writer.add_scalar(f"{folder}/loss_spat", spat_loss, step) 276 | if r2: 277 | writer.add_scalar(f"{folder}/r2", r2, step) 278 | 279 | def prob_loss_log(metric, metric_type="total"): 280 | if metric_type == "val": 281 | prob_loss, pdf = metric[1, 0], metric[1, 1] 282 | folder, log = f"mean_val", f"\tProbabilistic {self.mf_handle} -LLH loss:\t{prob_loss}\tPDF:\t{pdf}" 283 | else: 284 | prob_loss, pdf = np.mean(metric[:, 1, 0], axis=0) if 
self.mf_handle == "mean" else np.sum(metric[:, 1, 0], axis=0), None 285 | folder, log = f"current_step", f"\tProbabilistic {self.mf_handle} -LLH loss:\t{prob_loss}" 286 | if metric_type == "total": 287 | pdf = metric[0, 1, 1] 288 | folder = f"mean_train" 289 | log += f"\tPDF:\t{pdf}" 290 | 291 | if metric.shape[0] > 1: 292 | key_prob, minor_prob = metric[0, 1, 0], np.mean(metric[1:, 1, 0], axis=0) 293 | log += f"\n\t\tKey:\t{key_prob}\tMinor:\t{minor_prob}" 294 | writer.add_scalar(f"{folder}/prob_key", key_prob, step) 295 | writer.add_scalar(f"{folder}/prob_minor", minor_prob, step) 296 | 297 | print(log) 298 | writer.add_scalar(f"{folder}/loss_prob", prob_loss, step) 299 | if pdf: 300 | writer.add_scalar(f"{folder}/pdf", pdf, step) 301 | 302 | spatial = 'D^2' if self.dist else 'Coord' 303 | processed_metric = train_metric[0:cur_batch, :] 304 | mean_metric = np.mean(processed_metric, axis=0) 305 | current_batch_metric = processed_metric[-1, :] 306 | # print(current_batch_metric) 307 | 308 | print(f"LOG\tCurrent step: {step}\tLR:\t{lr}") 309 | total_loss_log(current_batch_metric, "current") 310 | spat_loss_log(current_batch_metric, "current") 311 | if self.outputs_map["sigma"]: 312 | prob_loss_log(current_batch_metric, "current") 313 | 314 | print(f"LOG\tTRAIN\tMean metrics:") 315 | total_loss_log(mean_metric, "total") 316 | spat_loss_log(mean_metric, "total") 317 | if self.outputs_map["sigma"]: 318 | prob_loss_log(mean_metric, "total") 319 | 320 | if val_metric is not None: 321 | mean_val_metric = np.mean(val_metric, axis=0) 322 | 323 | print(f"LOG\tVAL\tMean metrics:") 324 | total_loss_log(mean_val_metric, "val") 325 | spat_loss_log(mean_val_metric, "val") 326 | if self.outputs_map["sigma"]: 327 | prob_loss_log(mean_val_metric, "val") 328 | 329 | writer.flush() 330 | -------------------------------------------------------------------------------- /utils/cosine_scheduler.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Iterable 2 | from math import log, cos, pi, floor 3 | 4 | from torch.optim.lr_scheduler import _LRScheduler 5 | 6 | # cosine scheduler fpr learning rate during training 7 | # source code: https://github.com/abhuse/cyclic-cosine-decay 8 | # author: @abhuse 9 | 10 | 11 | class CyclicCosineDecayLR(_LRScheduler): 12 | def __init__(self, 13 | optimizer, 14 | init_decay_epochs, 15 | min_decay_lr, 16 | restart_interval=None, 17 | restart_interval_multiplier=None, 18 | restart_lr=None, 19 | warmup_epochs=None, 20 | warmup_start_lr=None, 21 | last_epoch=-1, 22 | verbose=False): 23 | """ 24 | Initialize new CyclicCosineDecayLR object. 25 | :param optimizer: (Optimizer) - Wrapped optimizer. 26 | :param init_decay_epochs: (int) - Number of initial decay epochs. 27 | :param min_decay_lr: (float or iterable of floats) - Learning rate at the end of decay. 28 | :param restart_interval: (int) - Restart interval for fixed cycles. 29 | Set to None to disable cycles. Default: None. 30 | :param restart_interval_multiplier: (float) - Multiplication coefficient for geometrically increasing cycles. 31 | Default: None. 32 | :param restart_lr: (float or iterable of floats) - Learning rate when cycle restarts. 33 | If None, optimizer's learning rate will be used. Default: None. 34 | :param warmup_epochs: (int) - Number of warmup epochs. Set to None to disable warmup. Default: None. 35 | :param warmup_start_lr: (float or iterable of floats) - Learning rate at the beginning of warmup. 36 | Must be set if warmup_epochs is not None. 
Default: None. 37 | :param last_epoch: (int) - The index of the last epoch. This parameter is used when resuming a training job. Default: -1. 38 | :param verbose: (bool) - If True, prints a message to stdout for each update. Default: False. 39 | """ 40 | 41 | if not isinstance(init_decay_epochs, int) or init_decay_epochs < 1: 42 | raise ValueError("init_decay_epochs must be positive integer, got {} instead".format(init_decay_epochs)) 43 | 44 | if isinstance(min_decay_lr, Iterable) and len(min_decay_lr) != len(optimizer.param_groups): 45 | raise ValueError("Expected len(min_decay_lr) to be equal to len(optimizer.param_groups), " 46 | "got {} and {} instead".format(len(min_decay_lr), len(optimizer.param_groups))) 47 | 48 | if restart_interval is not None and (not isinstance(restart_interval, int) or restart_interval < 1): 49 | raise ValueError("restart_interval must be positive integer, got {} instead".format(restart_interval)) 50 | 51 | if restart_interval_multiplier is not None and \ 52 | (not isinstance(restart_interval_multiplier, float) or restart_interval_multiplier <= 0): 53 | raise ValueError("restart_interval_multiplier must be positive float, got {} instead".format( 54 | restart_interval_multiplier)) 55 | 56 | if isinstance(restart_lr, Iterable) and len(restart_lr) != len(optimizer.param_groups): 57 | raise ValueError("Expected len(restart_lr) to be equal to len(optimizer.param_groups), " 58 | "got {} and {} instead".format(len(restart_lr), len(optimizer.param_groups))) 59 | 60 | if warmup_epochs is not None: 61 | if not isinstance(warmup_epochs, int) or warmup_epochs < 1: 62 | raise ValueError( 63 | "Expected warmup_epochs to be positive integer, got {} instead".format(type(warmup_epochs))) 64 | 65 | if warmup_start_lr is None: 66 | raise ValueError("warmup_start_lr must be set when warmup_epochs is not None") 67 | 68 | if not (isinstance(warmup_start_lr, float) or isinstance(warmup_start_lr, Iterable)): 69 | raise ValueError("warmup_start_lr must be either float or iterable of floats, got {} instead".format( 70 | warmup_start_lr)) 71 | 72 | if isinstance(warmup_start_lr, Iterable) and len(warmup_start_lr) != len(optimizer.param_groups): 73 | raise ValueError("Expected len(warmup_start_lr) to be equal to len(optimizer.param_groups), " 74 | "got {} and {} instead".format(len(warmup_start_lr), len(optimizer.param_groups))) 75 | 76 | group_num = len(optimizer.param_groups) 77 | self._warmup_start_lr = [warmup_start_lr] * group_num if isinstance(warmup_start_lr, float) else warmup_start_lr 78 | self._warmup_epochs = 0 if warmup_epochs is None else warmup_epochs 79 | self._init_decay_epochs = init_decay_epochs 80 | self._min_decay_lr = [min_decay_lr] * group_num if isinstance(min_decay_lr, float) else min_decay_lr 81 | self._restart_lr = [restart_lr] * group_num if isinstance(restart_lr, float) else restart_lr 82 | self._restart_interval = restart_interval 83 | self._restart_interval_multiplier = restart_interval_multiplier 84 | super(CyclicCosineDecayLR, self).__init__(optimizer, last_epoch, verbose=verbose) 85 | 86 | def get_lr(self): 87 | 88 | if self._warmup_epochs > 0 and self.last_epoch < self._warmup_epochs: 89 | return self._calc(self.last_epoch, 90 | self._warmup_epochs, 91 | self._warmup_start_lr, 92 | self.base_lrs) 93 | 94 | elif self.last_epoch < self._init_decay_epochs + self._warmup_epochs: 95 | return self._calc(self.last_epoch - self._warmup_epochs, 96 | self._init_decay_epochs, 97 | self.base_lrs, 98 | self._min_decay_lr) 99 | else: 100 | if self._restart_interval is 
not None: 101 | if self._restart_interval_multiplier is None: 102 | cycle_epoch = (self.last_epoch - self._init_decay_epochs - self._warmup_epochs) % self._restart_interval 103 | lrs = self.base_lrs if self._restart_lr is None else self._restart_lr 104 | return self._calc(cycle_epoch, 105 | self._restart_interval, 106 | lrs, 107 | self._min_decay_lr) 108 | else: 109 | n = self._get_n(self.last_epoch - self._warmup_epochs - self._init_decay_epochs) 110 | sn_prev = self._partial_sum(n) 111 | cycle_epoch = self.last_epoch - sn_prev - self._warmup_epochs - self._init_decay_epochs 112 | interval = self._restart_interval * self._restart_interval_multiplier ** n 113 | lrs = self.base_lrs if self._restart_lr is None else self._restart_lr 114 | return self._calc(cycle_epoch, 115 | interval, 116 | lrs, 117 | self._min_decay_lr) 118 | else: 119 | return self._min_decay_lr 120 | 121 | def _calc(self, t, T, lrs, min_lrs): 122 | return [min_lr + (lr - min_lr) * ((1 + cos(pi * t / T)) / 2) 123 | for lr, min_lr in zip(lrs, min_lrs)] 124 | 125 | def _get_n(self, epoch): 126 | _t = 1 - (1 - self._restart_interval_multiplier) * epoch / self._restart_interval 127 | return floor(log(_t, self._restart_interval_multiplier)) 128 | 129 | def _partial_sum(self, n): 130 | return self._restart_interval * (1 - self._restart_interval_multiplier ** n) / ( 131 | 1 - self._restart_interval_multiplier) 132 | -------------------------------------------------------------------------------- /utils/prediction.py: -------------------------------------------------------------------------------- 1 | from transformers import BertTokenizer 2 | import torch 3 | import numpy as np 4 | from pathlib import Path 5 | from utils.twitter_dataset import * 6 | from utils.result_visuals import * 7 | 8 | # single text prediction wrapper 9 | # preprocessing and result visual output 10 | class ModelOutput(): 11 | def __init__(self, wrapper, model_prefix, local=False): 12 | self.prefix = model_prefix 13 | self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") 14 | self.model = wrapper.model.to(self.device) 15 | self.local = local 16 | 17 | if self.local: 18 | local_model = f"models/final/{self.prefix}.pth" 19 | print(f"LOAD\tLoading local model from {local_model}") 20 | if not Path(local_model).is_file(): 21 | print(f"LOAD [ERROR] Unable to load local model: file {local_model} does not exist") 22 | 23 | state = torch.load(local_model) if torch.cuda.is_available() else torch.load(local_model, map_location='cpu') 24 | self.model.load_state_dict(state['model_state_dict']) 25 | 26 | self.outcomes = wrapper.n_outcomes 27 | self.cov = wrapper.cov 28 | self.weighted = wrapper.weighted 29 | self.feature = wrapper.features[0] 30 | self.tokenizer = BertTokenizer.from_pretrained(wrapper.original_model) 31 | self.benchmark = ModelBenchmark(wrapper, True, "pos", "mean", "mean" if self.cov else "type") 32 | 33 | self.result = None 34 | self.visual = None 35 | 36 | def prediction_output(self, text, filtering=True, visual=False): 37 | if filtering: 38 | text = nlp_filtering(text) 39 | print(f"TEXT\tFiltered text: {text}") 40 | 41 | self.result = ResultManager(None, text, self.feature, self.device, self.benchmark, False, False, self.prefix) 42 | 43 | if self.local: 44 | print("TEXT\tTokenizing text to input IDs and attention masks") 45 | encoded_corpus = self.tokenizer(text=text, 46 | add_special_tokens=True, 47 | padding='max_length', 48 | truncation='longest_first', 49 | max_length=300, 50 | return_attention_mask=True) 51 | 
input_id = encoded_corpus['input_ids'] 52 | attention_mask = encoded_corpus['attention_mask'] 53 | 54 | input = torch.tensor(input_id).to(self.device).reshape(1, -1) 55 | mask = torch.tensor(attention_mask).to(self.device).reshape(1, -1) 56 | 57 | self.model.eval() 58 | with torch.no_grad(): 59 | output = self.model(input, mask, self.feature) 60 | 61 | if self.cov: 62 | prob_model = self.benchmark.prob_models(output) 63 | 64 | output = output.cpu().numpy() if torch.cuda.is_available() else output.numpy() 65 | 66 | print(f"RESULT\tPost-processing raw model outputs: {output}") 67 | self.result.soft_outputs(list([prob_model])) if self.cov else self.result.coord_outputs(output) 68 | 69 | else: 70 | print("TEXT\tTokenizing text to input IDs and attention masks") 71 | inputs = self.tokenizer(text, return_tensors="pt") 72 | 73 | with torch.no_grad(): 74 | output = self.model(**inputs) 75 | prob_model = self.benchmark.prob_models(output) 76 | 77 | print(f"RESULT\tPost-processing raw model outputs: {output}") 78 | self.result.soft_outputs(list([prob_model])) 79 | 80 | if visual: 81 | self.visual = ResultVisuals(self.result) 82 | self.visual.text_map_result() 83 | 84 | return self.result 85 | -------------------------------------------------------------------------------- /utils/regressor.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from transformers import BertModel, BertPreTrainedModel, BertConfig 3 | 4 | # general model wrapper 5 | # linear regression fork for features and preset outputs 6 | class BERTregModel(): 7 | def __init__(self, n_outcomes=1, covariance=None, weighted=False, features=None, base_model_name=None, hub_model=None): 8 | self.n_outcomes = n_outcomes 9 | self.cov = covariance 10 | self.weighted = weighted 11 | self.features = ["NON-GEO"] if features is None else features 12 | 13 | print(f"MODEL\tInitializing BERT Regression model for {self.n_outcomes} outcome(s)") 14 | # features 15 | print(f"MODEL\tText features:\t{' + '.join(self.features)}") 16 | # longitude, latitude for n outcomes 17 | self.coord_output = self.n_outcomes * 2 18 | print(f"MODEL\tCoordinates:\t{self.coord_output}") 19 | # weights of gaussians 20 | self.weights_output = self.n_outcomes if self.weighted and self.n_outcomes > 1 else 0 21 | if self.weights_output > 0: 22 | print(f"MODEL\tWeights:\t{self.weights_output}") 23 | 24 | # covariance matrix 25 | self.covariances = {'spher': self.n_outcomes, 26 | 'diag': self.n_outcomes * 2, 27 | 'tied': 3, 28 | 'full': self.n_outcomes * 3} 29 | if self.cov is None: 30 | self.cov_output = 0 31 | print(f"MODEL\tNon-probabilistic model has been chosen") 32 | else: 33 | if self.cov not in self.covariances: 34 | self.cov = 'spher' 35 | self.cov_output = self.covariances[self.cov] 36 | print(f"MODEL\tCovariances:\t{self.cov_output}\tmatrix type:\t{self.cov}") 37 | 38 | self.original_model = "bert-base-multilingual-cased" if base_model_name is None else base_model_name 39 | print(f"MODEL\tOriginal model to load:\t{self.original_model}") 40 | 41 | self.key_output = self.coord_output + self.weights_output + self.cov_output 42 | self.minor_output = 2 43 | self.minor_output += 1 if self.cov_output > 0 else 0 44 | 45 | self.feature_outputs = {} 46 | for f in range(len(self.features)): 47 | if f == 0: 48 | output = self.key_output 49 | print(f"MODEL\tKey feature \t{self.features[f]} outputs:\t{output}") 50 | else: 51 | output = self.minor_output 52 | print(f"MODEL\tMinor feature\t{self.features[f]} 
outputs:\t{output}") 53 | self.feature_outputs[self.features[f]] = output 54 | 55 | if hub_model: 56 | self.model = GeoBertModel(BertConfig.from_pretrained(self.original_model), self.feature_outputs) 57 | print(f"LOAD\tLoading HF model from {hub_model}") 58 | self.model = self.model.from_pretrained(hub_model, self.feature_outputs) 59 | else: 60 | self.model = BertRegressor(self.original_model, self.feature_outputs) 61 | 62 | 63 | 64 | # Train model wrapper layer 65 | class BertRegressor(nn.Module): 66 | def __init__(self, model_name, feature_outputs): 67 | super(BertRegressor, self).__init__() 68 | self.bert = BertModel.from_pretrained(model_name, return_dict=True) 69 | self.feature_outputs = feature_outputs 70 | 71 | self.key_regressor = nn.Linear(768, list(self.feature_outputs.values())[0]) 72 | if len(self.feature_outputs) > 1: 73 | self.minor_regressor = nn.Linear(768, list(self.feature_outputs.values())[1]) 74 | 75 | def forward(self, input_ids, attention_masks, feature_name): 76 | outputs = self.bert(input_ids, attention_masks) 77 | if feature_name == list(self.feature_outputs.keys())[0]: 78 | outputs = self.key_regressor(outputs[1]) 79 | else: 80 | outputs = self.minor_regressor(outputs[1]) 81 | return outputs 82 | 83 | 84 | # HF model wrapper layer 85 | class GeoBertModel(BertPreTrainedModel): 86 | def __init__(self, config, feature_outputs): 87 | super().__init__(config) 88 | self.bert = BertModel(config) 89 | self.feature_outputs = feature_outputs 90 | 91 | self.key_regressor = nn.Linear(config.hidden_size, list(self.feature_outputs.values())[0]) 92 | if len(self.feature_outputs) > 1: 93 | self.minor_regressor = nn.Linear(config.hidden_size, list(self.feature_outputs.values())[1]) 94 | 95 | def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, feature_name=None): 96 | outputs = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask) 97 | pooler_output = outputs[1] 98 | if feature_name is None or feature_name == list(self.feature_outputs.keys())[0]: 99 | custom_output = self.key_regressor(pooler_output) 100 | else: 101 | custom_output = self.minor_regressor(pooler_output) 102 | return custom_output 103 | -------------------------------------------------------------------------------- /utils/twitter_dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | import torch 5 | from torch.utils.data import TensorDataset, DataLoader 6 | 7 | import string 8 | import re 9 | import os 10 | import datetime 11 | 12 | from sklearn.model_selection import train_test_split 13 | 14 | import GPUtil 15 | import psutil 16 | 17 | # dataset wrapper 18 | 19 | 20 | def load_jsonl(filename): 21 | filename = f"datasets/{filename}" 22 | print(f"DATASET\tLOAD: {psutil.virtual_memory().percent}%\tLoading dataset from {filename}") 23 | data = pd.read_json(path_or_buf=filename, lines=True) 24 | print(f"DATASET\tLOAD: {psutil.virtual_memory().percent}%\tDataset of {len(data.index)} samples and {len(data.user.unique())} users is loaded") 25 | return data 26 | 27 | 28 | def save_df(df, filename, prefix=None): 29 | if prefix is None: 30 | prefix = "td-" 31 | 32 | size = len(df.index) 33 | save_filename = f"datasets/{prefix}{size}-{filename}" 34 | 35 | with open(save_filename, "w") as f: 36 | df.to_json(f, orient='records', lines=True) 37 | print(f"DATASET\tSAVE\tTwitter Dataset of {size} samples is 
written to file: {save_filename}") 38 | 39 | 40 | def combine_datasets(file_list, filename, prefix=None): 41 | if prefix is None: 42 | prefix = "td-" 43 | 44 | df = load_jsonl(file_list[0]) 45 | for file in file_list[1:]: 46 | data = load_jsonl(file) 47 | df = pd.concat([df, data], ignore_index=True) 48 | 49 | save_df(df, filename, prefix) 50 | 51 | 52 | def filter_bots(data, min_total=1, max_day=20): 53 | print(f"DATASET\tFiltering dataset of {len(data['user'].unique())} users from bots posting more than {max_day} tweets per day") 54 | if "time" in data.columns: 55 | data['date'] = pd.to_datetime(data['time'], utc=False).dt.date 56 | user_tweets_per_day = data.groupby(['date'])['user'].value_counts() 57 | user_tweets = user_tweets_per_day[user_tweets_per_day < max_day].droplevel(0).groupby(["user"]).sum() 58 | else: 59 | user_tweets = data['user'].value_counts() 60 | 61 | # data["time"] = data['time'].apply(lambda x: datetime.datetime.combine(x, datetime.time.min).timestamp()) 62 | data['date'] = data['time'].apply(lambda x: datetime.datetime.strptime(x, '%a %b %d %H:%M:%S %z %Y').timestamp()) 63 | # data.drop("date", axis=1, inplace=True) 64 | data["date"] = data["date"].astype(float) 65 | 66 | user_list = user_tweets[user_tweets > min_total].index.tolist() 67 | data = data[data['user'].isin(user_list)] 68 | 69 | print(f"DATASET\tSize of the filtered dataset with {len(data['user'].unique())} users: {len(data.index)} samples") 70 | return data 71 | 72 | 73 | # text preprocessing 74 | def nlp_filtering(text): 75 | def filter_punctuation(text): 76 | punctuationfree = "".join([i for i in text if i not in string.punctuation]) 77 | return punctuationfree 78 | 79 | def filter_websites(text): 80 | #pattern = r'(http\:\/\/|https\:\/\/)?([a-z0-9][a-z0-9\-]*\.)+[a-z][a-z\-]*' 81 | pattern = r'http\S+' 82 | text = re.sub(pattern, '', text) 83 | return text 84 | 85 | text = filter_websites(text) 86 | text = filter_punctuation(text) 87 | return text 88 | 89 | 90 | def create_dataloader(inputs, masks, labels, batch_size, shuffle=False): 91 | dataset = TensorDataset(torch.tensor(inputs), torch.tensor(masks), torch.tensor(labels)) 92 | return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle) 93 | 94 | 95 | # crop dataset to remove already used data before random sampling 96 | def crop_dataset(data, size, seed, by_user=False, ref_file=None, save=False): 97 | print(f"DATASET\tCropping dataset of {len(data.index)} in {size} samples with seed {seed}{' including unique users' if by_user else ''}") 98 | if by_user: 99 | if ref_file: 100 | df = load_jsonl(ref_file) 101 | crop_data = df.sample(n=size, random_state=seed) 102 | else: 103 | crop_data = data.sample(n=size, random_state=seed) 104 | data = data.drop(crop_data.index, axis=0) 105 | 106 | crop_users = crop_data["user"].unique() 107 | 108 | print(f"DATASET\tUnique users to crop: {len(crop_users)}") 109 | data = data[~data["user"].isin(crop_users)] 110 | print(f"DATASET\tUnique users left: {len(data['user'].unique())}") 111 | else: 112 | crop_data = data.sample(n=size, random_state=seed) 113 | data = data.drop(crop_data.index, axis=0) 114 | 115 | if save and ref_file: 116 | save_df(crop_data, ref_file, "train-") 117 | 118 | print(f"DATASET\tReduced dataset length is {len(data.index)}") 119 | return data 120 | 121 | 122 | # sample users (bots filtering) for evaluation 123 | def sample_users(data, users_n, seed=42): 124 | user_tweets = data['user'].value_counts() 125 | user_list = user_tweets.sample(n=users_n, random_state=seed).index.tolist() 126 | return 
data[data['user'].isin(user_list)] 127 | 128 | 129 | class TwitterDataloader(): 130 | def __init__(self, filename, features, target, tokenizer, seed=42, scaled=False, val_feature=None, bot_filter=False): 131 | self.filename = filename 132 | self.data = load_jsonl(self.filename) 133 | if "texts" in self.data.columns: 134 | self.data.rename(columns={'longitude': 'lon', 135 | 'latitude': 'lat', 136 | 'created_at': 'time', 137 | 'texts': 'text'}, inplace=True) 138 | self.data.to_json(f"datasets/{self.filename}", orient='records', lines=True) 139 | print(self.data.info()) 140 | 141 | if bot_filter: 142 | self.data = filter_bots(self.data) 143 | save_df(self.data, self.filename, "filtered-") 144 | 145 | self.target = target 146 | 147 | self.tokenizer = tokenizer 148 | self.seed = seed 149 | 150 | self.scaled = scaled 151 | 152 | self.feature_columns = { 153 | "GEO": ["text", "place", "user"], 154 | "NON-GEO": ["text", "user"], 155 | "NON-USER": ["text", "place"], 156 | "META-DATA": ["place", "user"], 157 | "TEXT-ONLY": ["text"], 158 | "GEO-ONLY": ["place"], 159 | "USER-ONLY": ["user"], 160 | } 161 | 162 | self.features = features 163 | self.n_features = len(features) 164 | self.key_feature = features[0] 165 | self.minor_features = features[1:] 166 | 167 | self.val_feature = features[0] if val_feature is None else val_feature 168 | 169 | self.val_dataloader, self.train_dataloader, self.test_dataloader = None, None, None 170 | self.val_df = None 171 | 172 | # filtering dataset files by column condition 173 | def filter_dataset(self, column_name, filter_text=None, filter_list=None, save=True): 174 | if filter_text: 175 | filter_df = self.data[self.data[column_name] == filter_text] 176 | prefix = f"f-{column_name}-{filter_text}-td-" 177 | elif filter_list: 178 | filter_df = self.data[self.data[column_name].isin(filter_list)] 179 | prefix = f"f-{column_name}-{'+'.join(filter_list)}-td-" 180 | 181 | if save: 182 | save_df(filter_df, self.filename, prefix) 183 | 184 | # tokenization - text to IDs and attention masks 185 | def tokenize(self, column): 186 | encoded_corpus = self.tokenizer(text=column, 187 | add_special_tokens=True, 188 | padding='max_length', 189 | truncation='longest_first', 190 | max_length=300, 191 | return_attention_mask=True) 192 | 193 | input_ids = encoded_corpus['input_ids'] 194 | attention_masks = encoded_corpus['attention_mask'] 195 | return input_ids, attention_masks 196 | 197 | # forming feature columns, dropping old columns 198 | def feature_split_filter(self, data): 199 | text_features = self.features + [self.val_feature] if self.val_feature not in self.features else self.features 200 | for f in text_features: 201 | data[f] = data[self.feature_columns[f]].astype(str).agg(" ".join, axis=1) if len(self.feature_columns[f]) > 1 else data[self.feature_columns[f]] 202 | data[f] = data[f].astype(str).apply(nlp_filtering) 203 | 204 | for f in text_features: 205 | for column in self.feature_columns[f]: 206 | if column in data.columns: 207 | data = data.drop(columns=[column], axis=1) 208 | return data 209 | 210 | def form_training(self, batch_size, size, test_ratio, skip_size=0, shuffle=True): 211 | if skip_size > 0: 212 | self.data = crop_dataset(self.data, skip_size, self.seed) 213 | 214 | print(f"DATASET\tForming training dataset of {size} samples with test size {int(test_ratio*size)} for features: {', '.join(self.features)}") 215 | train_df = self.data.sample(n=size, random_state=self.seed).copy() 216 | del self.data 217 | train_df = self.feature_split_filter(train_df) 218 | 
self.create_feature_dataloaders(train_df, True, batch_size, shuffle, test_ratio) 219 | 220 | def form_validation(self, batch_size, size, by_user=False, skip_size=0, ref_train_file=None): 221 | if skip_size > 0: 222 | self.data = crop_dataset(self.data, skip_size, self.seed, by_user, ref_train_file, True) 223 | 224 | # self.data = filter_bots(self.data) 225 | save_df(self.data, self.filename, "test-") 226 | 227 | print(f"DATASET\tForming validation dataset of {size} {'users' if by_user else 'samples'} with batch size {batch_size} for {self.val_feature} text feature") 228 | if by_user: 229 | self.val_df = sample_users(self.data, size, seed=self.seed).copy() 230 | self.features += ["USER-ONLY"] 231 | else: 232 | self.val_df = self.data.sample(n=size, random_state=self.seed).copy() 233 | del self.data 234 | 235 | print(f"DATASET\tSize of the validation dataset with {len(self.val_df['user'].unique())} users: {len(self.val_df.index)} samples") 236 | 237 | self.val_df = self.feature_split_filter(self.val_df) 238 | self.create_feature_dataloaders(self.val_df, False, batch_size) 239 | 240 | # training and evaluation dataloaders formation 241 | def create_feature_dataloaders(self, df, train, batch_size, shuffle=False, test_ratio=None): 242 | if train: 243 | train_index, test_index = train_test_split(df.index, test_size=test_ratio, random_state=self.seed) 244 | train_index, test_index = list(train_index), list(test_index) 245 | train_size, test_size = len(train_index), len(test_index) 246 | train_inputs, train_masks = np.empty((train_size, 0)), np.empty((train_size, 0)) 247 | 248 | for feature in self.features: 249 | input_ids, attention_mask = self.tokenize(df.loc[train_index, feature].tolist()) 250 | train_inputs, train_masks = np.concatenate((train_inputs, input_ids), axis=1), np.concatenate((train_masks, attention_mask), axis=1) 251 | 252 | test_inputs, test_masks = self.tokenize(df.loc[test_index, self.val_feature].tolist()) 253 | if self.scaled: 254 | train_labels = np.reshape(np.multiply(df.loc[train_index, self.target].to_numpy(), 0.01), (train_size, 2)) 255 | test_labels = np.reshape(np.multiply(df.loc[test_index, self.target].to_numpy(), 0.01), (test_size, 2)) 256 | else: 257 | train_labels = np.reshape(df.loc[train_index, self.target].to_numpy(), (train_size, 2)) 258 | test_labels = np.reshape(df.loc[test_index, self.target].to_numpy(), (test_size, 2)) 259 | 260 | self.train_dataloader = create_dataloader(np.reshape(train_inputs, (train_size, self.n_features, 300)), 261 | np.reshape(train_masks, (train_size, self.n_features, 300)), train_labels, batch_size, shuffle) 262 | self.test_dataloader = create_dataloader(test_inputs, test_masks, test_labels, batch_size, shuffle) 263 | del df 264 | else: 265 | labels = df[self.target].to_numpy() 266 | if self.scaled: 267 | labels = np.multiply(labels, 0.01) 268 | val_inputs, val_masks = self.tokenize(df[self.val_feature].tolist()) 269 | 270 | self.val_dataloader = create_dataloader(val_inputs, val_masks, labels, batch_size, shuffle) 271 | -------------------------------------------------------------------------------- /valid_data.py: -------------------------------------------------------------------------------- 1 | from utils.result_visuals import * 2 | from utils.regressor import * 3 | import torch 4 | 5 | # results manager and visual test on evaluated datasets 6 | ww = "bert-base-multilingual-cased" 7 | us = "bert-base-cased" 8 | 9 | feature = "NON-GEO" 10 | file = 
f"U-NON-GEO+GEO-ONLY-O1-d-total_mean-mf_mean-pos_spher-N30e5-B10-E3-cosine-LR[1e-05;1e-06]_predicted_N300000_VF-NON-GEO_2023-02-19" 11 | 12 | input_pred = f"results/val-data/{file}.jsonl" 13 | 14 | # output_pred = f"results/val-data/{file}-out.jsonl" 15 | # output_map_point = f"img/map-test-{feature}.png" 16 | # output_map_line = f"img/map-test-{feature}.png" 17 | # output_dist = f"img/dist-test-{feature}.png" 18 | 19 | if torch.cuda.is_available(): 20 | device = torch.device("cuda") 21 | print(f"Available GPU has {torch.cuda.device_count()} devices, using {torch.cuda.get_device_name(0)}") 22 | else: 23 | print(f"No GPU available, using the CPU with {torch.get_num_threads()} threads instead.") 24 | device = torch.device("cpu") 25 | 26 | bert_wrapper = BERTregModel(n_outcomes=1, covariance="spher", weighted=False, features=["NON-GEO", "GEO-ONLY"], model_name=ww) 27 | model = ModelBenchmark(bert_wrapper, distance=True, loss_prob="pos", mf_loss="mean", total_loss="mean") 28 | result = ResultManager(None, None, feature, device, model, scaled=False, by_user=False, prefix=file) 29 | result.load_df(input_pred) 30 | 31 | # metrics 32 | # result.result_metrics(True, 100) 33 | # result.result_metrics(False, 100) 34 | 35 | result.performance() 36 | 37 | # visual = ResultVisuals(result) 38 | 39 | # standard 40 | # visual.density() 41 | # visual.cum_dist(False, 161) 42 | 43 | # GMM 44 | # visual.summarize_prediction(1) 45 | # visual.gaus_map() 46 | # visual.prob_map_animation(228) 47 | 48 | # visual.interactive_map(lines=False, best=True) 49 | 50 | # result.save_df() 51 | 52 | 53 | 54 | --------------------------------------------------------------------------------