├── annotation-program ├── templates │ ├── index.html │ ├── login.html │ ├── user_stats.html │ ├── tag.html │ ├── home.html │ ├── base.html │ ├── register.html │ └── annotate.html ├── migrations │ ├── README │ ├── __pycache__ │ │ └── env.cpython-39.pyc │ ├── versions │ │ ├── __pycache__ │ │ │ └── 164cfc37a367_added_similarity_model.cpython-39.pyc │ │ └── 164cfc37a367_added_similarity_model.py │ ├── script.py.mako │ ├── alembic.ini │ └── env.py └── app.py ├── classification ├── model │ ├── svm_model.sav │ ├── binary_nb_model.sav │ ├── binary_svm_model.sav │ ├── multi_nb_model.sav │ ├── multi_svm_model.sav │ ├── binary_vectorizer_model.sav │ ├── multi_vectorizer_model.sav │ └── binary_svm_vectorizer_model.sav └── sklearn_classifier.py ├── recommender ├── LTR_resources │ ├── emb_pattern.pkl │ ├── emb_pattern_title.pkl │ ├── emb_pattern_excerpt.pkl │ ├── emb_pattern_overflow.pkl │ ├── emb_pattern_title_overflow.pkl │ └── emb_pattern_excerpt_overflow.pkl ├── static │ └── shap_plots │ │ ├── pattern_0.png │ │ ├── pattern_1.png │ │ ├── pattern_2.png │ │ ├── pattern_4.png │ │ ├── pattern_5.png │ │ ├── pattern_7.png │ │ ├── pattern_9.png │ │ ├── pattern_10.png │ │ ├── pattern_11.png │ │ ├── pattern_14.png │ │ ├── pattern_16.png │ │ ├── pattern_18.png │ │ ├── pattern_22.png │ │ ├── pattern_25.png │ │ ├── pattern_27.png │ │ ├── pattern_29.png │ │ ├── pattern_30.png │ │ ├── pattern_31.png │ │ ├── pattern_36.png │ │ ├── pattern_37.png │ │ ├── pattern_38.png │ │ ├── pattern_40.png │ │ ├── pattern_41.png │ │ ├── pattern_42.png │ │ ├── pattern_43.png │ │ ├── pattern_44.png │ │ ├── pattern_45.png │ │ ├── pattern_47.png │ │ ├── pattern_49.png │ │ ├── pattern_51.png │ │ ├── pattern_52.png │ │ ├── pattern_53.png │ │ ├── pattern_54.png │ │ ├── pattern_55.png │ │ ├── pattern_56.png │ │ ├── pattern_57.png │ │ ├── pattern_59.png │ │ ├── pattern_61.png │ │ ├── pattern_63.png │ │ ├── pattern_64.png │ │ ├── pattern_66.png │ │ ├── pattern_72.png │ │ ├── waterfall_pattern_0.png │ │ ├── waterfall_pattern_1.png │ │ ├── waterfall_pattern_2.png │ │ ├── waterfall_pattern_4.png │ │ ├── waterfall_pattern_5.png │ │ ├── waterfall_pattern_7.png │ │ ├── waterfall_pattern_9.png │ │ ├── pattern_Lawful Consent.png │ │ ├── pattern_Onion Routing.png │ │ ├── pattern_Private link.png │ │ ├── waterfall_pattern_10.png │ │ ├── waterfall_pattern_11.png │ │ ├── waterfall_pattern_14.png │ │ ├── waterfall_pattern_16.png │ │ ├── waterfall_pattern_18.png │ │ ├── waterfall_pattern_22.png │ │ ├── waterfall_pattern_25.png │ │ ├── waterfall_pattern_27.png │ │ ├── waterfall_pattern_29.png │ │ ├── waterfall_pattern_30.png │ │ ├── waterfall_pattern_31.png │ │ ├── waterfall_pattern_36.png │ │ ├── waterfall_pattern_37.png │ │ ├── waterfall_pattern_38.png │ │ ├── waterfall_pattern_40.png │ │ ├── waterfall_pattern_41.png │ │ ├── waterfall_pattern_42.png │ │ ├── waterfall_pattern_43.png │ │ ├── waterfall_pattern_44.png │ │ ├── waterfall_pattern_45.png │ │ ├── waterfall_pattern_47.png │ │ ├── waterfall_pattern_49.png │ │ ├── waterfall_pattern_51.png │ │ ├── waterfall_pattern_52.png │ │ ├── waterfall_pattern_53.png │ │ ├── waterfall_pattern_54.png │ │ ├── waterfall_pattern_55.png │ │ ├── waterfall_pattern_56.png │ │ ├── waterfall_pattern_57.png │ │ ├── waterfall_pattern_59.png │ │ ├── waterfall_pattern_61.png │ │ ├── waterfall_pattern_63.png │ │ ├── waterfall_pattern_64.png │ │ ├── waterfall_pattern_66.png │ │ ├── waterfall_pattern_72.png │ │ ├── pattern_Personal Data Store.png │ │ ├── pattern_Psuedonymous Identity.png │ │ ├── pattern_Selective access 
control.png │ │ ├── pattern_Single Point of Contact.png │ │ ├── pattern_Obtaining Explicit Consent.png │ │ ├── pattern_Active broadcast of presence.png │ │ ├── pattern_Attribute Based Credentials.png │ │ ├── pattern_Protection against Tracking.png │ │ ├── pattern_[Support] Selective Disclosure.png │ │ ├── pattern_Encryption with user-managed keys.png │ │ ├── pattern_Anonymous Reputation-based Blacklisting.png │ │ └── pattern_Decoupling [content] and location information visibility.png ├── classification_model │ ├── multi_nb_model.sav │ ├── binary_nb_model.sav │ ├── binary_vectorizer_model.sav │ └── multi_vectorizer_model.sav ├── __pycache__ │ └── feature_creation.cpython-39.pyc ├── README.md ├── predict_console.py ├── templates │ └── index.html ├── data │ └── patterns_new.json ├── app.py └── feature_creation.py ├── README.md └── letor ├── feature_importance.py ├── lightgbm_ltr_train.py ├── compute_shap.py ├── case_studies.py └── feature_creation.py /annotation-program/templates/index.html: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /annotation-program/migrations/README: -------------------------------------------------------------------------------- 1 | Single-database configuration for Flask. 2 | -------------------------------------------------------------------------------- /classification/model/svm_model.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/classification/model/svm_model.sav -------------------------------------------------------------------------------- /classification/model/binary_nb_model.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/classification/model/binary_nb_model.sav -------------------------------------------------------------------------------- /classification/model/binary_svm_model.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/classification/model/binary_svm_model.sav -------------------------------------------------------------------------------- /classification/model/multi_nb_model.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/classification/model/multi_nb_model.sav -------------------------------------------------------------------------------- /classification/model/multi_svm_model.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/classification/model/multi_svm_model.sav -------------------------------------------------------------------------------- /recommender/LTR_resources/emb_pattern.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/LTR_resources/emb_pattern.pkl -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_0.png 
-------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_1.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_2.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_4.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_5.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_7.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_9.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_10.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_11.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_14.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_16.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_18.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_18.png 
-------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_22.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_22.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_25.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_25.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_27.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_27.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_29.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_29.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_30.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_30.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_31.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_31.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_36.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_36.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_37.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_37.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_38.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_38.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_40.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_40.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_41.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_41.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_42.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_42.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_43.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_43.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_44.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_44.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_45.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_45.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_47.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_47.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_49.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_49.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_51.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_51.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_52.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_52.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_53.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_53.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_54.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_54.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_55.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_55.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_56.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_56.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_57.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_57.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_59.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_59.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_61.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_61.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_63.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_63.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_64.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_64.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_66.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_66.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_72.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_72.png -------------------------------------------------------------------------------- /classification/model/binary_vectorizer_model.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/classification/model/binary_vectorizer_model.sav -------------------------------------------------------------------------------- /classification/model/multi_vectorizer_model.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/classification/model/multi_vectorizer_model.sav -------------------------------------------------------------------------------- 
/recommender/LTR_resources/emb_pattern_title.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/LTR_resources/emb_pattern_title.pkl -------------------------------------------------------------------------------- /recommender/LTR_resources/emb_pattern_excerpt.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/LTR_resources/emb_pattern_excerpt.pkl -------------------------------------------------------------------------------- /recommender/LTR_resources/emb_pattern_overflow.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/LTR_resources/emb_pattern_overflow.pkl -------------------------------------------------------------------------------- /recommender/classification_model/multi_nb_model.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/classification_model/multi_nb_model.sav -------------------------------------------------------------------------------- /classification/model/binary_svm_vectorizer_model.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/classification/model/binary_svm_vectorizer_model.sav -------------------------------------------------------------------------------- /recommender/classification_model/binary_nb_model.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/classification_model/binary_nb_model.sav -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_0.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_1.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_2.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_4.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_5.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_5.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_7.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_9.png -------------------------------------------------------------------------------- /recommender/LTR_resources/emb_pattern_title_overflow.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/LTR_resources/emb_pattern_title_overflow.pkl -------------------------------------------------------------------------------- /recommender/__pycache__/feature_creation.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/__pycache__/feature_creation.cpython-39.pyc -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_Lawful Consent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_Lawful Consent.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_Onion Routing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_Onion Routing.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_Private link.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_Private link.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_10.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_11.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_14.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_14.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_16.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_18.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_18.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_22.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_22.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_25.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_25.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_27.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_27.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_29.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_29.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_30.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_30.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_31.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_31.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_36.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_36.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_37.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_37.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_38.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_38.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_40.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_40.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_41.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_41.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_42.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_42.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_43.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_43.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_44.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_44.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_45.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_45.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_47.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_47.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_49.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_49.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_51.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_51.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_52.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_52.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_53.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_53.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_54.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_54.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_55.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_55.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_56.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_56.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_57.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_57.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_59.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_59.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_61.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_61.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_63.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_63.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_64.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_64.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_66.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_66.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_72.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_72.png -------------------------------------------------------------------------------- /recommender/LTR_resources/emb_pattern_excerpt_overflow.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/LTR_resources/emb_pattern_excerpt_overflow.pkl -------------------------------------------------------------------------------- /annotation-program/migrations/__pycache__/env.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/annotation-program/migrations/__pycache__/env.cpython-39.pyc -------------------------------------------------------------------------------- /recommender/classification_model/binary_vectorizer_model.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/classification_model/binary_vectorizer_model.sav -------------------------------------------------------------------------------- /recommender/classification_model/multi_vectorizer_model.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/classification_model/multi_vectorizer_model.sav -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_Personal Data Store.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_Personal Data Store.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_Psuedonymous Identity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_Psuedonymous Identity.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_Selective access control.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_Selective access control.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_Single Point of Contact.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_Single Point of Contact.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_Obtaining Explicit Consent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_Obtaining Explicit Consent.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_Active broadcast of presence.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_Active broadcast of presence.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_Attribute Based Credentials.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_Attribute Based Credentials.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_Protection against Tracking.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_Protection against Tracking.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_[Support] Selective Disclosure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_[Support] Selective Disclosure.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_Encryption with user-managed keys.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_Encryption with user-managed keys.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_Anonymous Reputation-based Blacklisting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_Anonymous Reputation-based Blacklisting.png -------------------------------------------------------------------------------- /recommender/README.md: -------------------------------------------------------------------------------- 1 | # Privacy Design Pattern Recommender 2 | 3 | Run the recommender program by executing `python app.py`. 
4 | 5 | ## Prerequisites 6 | Install the following Python libraries: 7 | - lightgbm 8 | - shap 9 | - flask -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_Decoupling [content] and location information visibility.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_Decoupling [content] and location information visibility.png -------------------------------------------------------------------------------- /annotation-program/migrations/versions/__pycache__/164cfc37a367_added_similarity_model.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/annotation-program/migrations/versions/__pycache__/164cfc37a367_added_similarity_model.cpython-39.pyc -------------------------------------------------------------------------------- /annotation-program/migrations/script.py.mako: -------------------------------------------------------------------------------- 1 | """${message} 2 | 3 | Revision ID: ${up_revision} 4 | Revises: ${down_revision | comma,n} 5 | Create Date: ${create_date} 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | ${imports if imports else ""} 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = ${repr(up_revision)} 14 | down_revision = ${repr(down_revision)} 15 | branch_labels = ${repr(branch_labels)} 16 | depends_on = ${repr(depends_on)} 17 | 18 | 19 | def upgrade(): 20 | ${upgrades if upgrades else "pass"} 21 | 22 | 23 | def downgrade(): 24 | ${downgrades if downgrades else "pass"} 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Learning-to-Rank Privacy Design Pattern 2 | 3 | Supplementary repository for "Learning to Rank Privacy Design Patterns: A Semantic Approach to Meeting Privacy Requirements" paper. 4 | 5 | ## Repository Structure 6 | - **annotation-program**: Contains annotation software and `annotation_result.json` for replicating relevance assessments. 7 | - **classification**: Code for text classification essential for feature engineering in LTR. 8 | - **letor**: Core code for the learning-to-rank process. 9 | - **recommender**: Flask program interfacing the recommender system. 10 | 11 | ## Training Data Access 12 | The training data, including Hadamard product and concatenation embedding, is available at [https://bit.ly/letor_priv](https://bit.ly/letor_priv) for download. -------------------------------------------------------------------------------- /annotation-program/templates/login.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block title %}Welcome{% endblock %} 4 | 5 | {% block content %} 6 | 7 |
8 |

Login

9 | {% for message in get_flashed_messages() %} 10 |
{{ message }}
11 | {% endfor %} 12 | 13 |
14 |
15 | 16 | 17 |
18 |
19 | 20 | 21 |
22 | 23 |
24 | Register 25 |
26 | 27 | {% endblock %} -------------------------------------------------------------------------------- /annotation-program/migrations/alembic.ini: -------------------------------------------------------------------------------- 1 | # A generic, single database configuration. 2 | 3 | [alembic] 4 | # template used to generate migration files 5 | # file_template = %%(rev)s_%%(slug)s 6 | 7 | # set to 'true' to run the environment during 8 | # the 'revision' command, regardless of autogenerate 9 | # revision_environment = false 10 | 11 | 12 | # Logging configuration 13 | [loggers] 14 | keys = root,sqlalchemy,alembic,flask_migrate 15 | 16 | [handlers] 17 | keys = console 18 | 19 | [formatters] 20 | keys = generic 21 | 22 | [logger_root] 23 | level = WARN 24 | handlers = console 25 | qualname = 26 | 27 | [logger_sqlalchemy] 28 | level = WARN 29 | handlers = 30 | qualname = sqlalchemy.engine 31 | 32 | [logger_alembic] 33 | level = INFO 34 | handlers = 35 | qualname = alembic 36 | 37 | [logger_flask_migrate] 38 | level = INFO 39 | handlers = 40 | qualname = flask_migrate 41 | 42 | [handler_console] 43 | class = StreamHandler 44 | args = (sys.stderr,) 45 | level = NOTSET 46 | formatter = generic 47 | 48 | [formatter_generic] 49 | format = %(levelname)-5.5s [%(name)s] %(message)s 50 | datefmt = %H:%M:%S 51 | -------------------------------------------------------------------------------- /annotation-program/migrations/versions/164cfc37a367_added_similarity_model.py: -------------------------------------------------------------------------------- 1 | """Added Similarity model 2 | 3 | Revision ID: 164cfc37a367 4 | Revises: 5 | Create Date: 2023-10-27 09:37:11.624391 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '164cfc37a367' 14 | down_revision = None 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | # ### commands auto generated by Alembic - please adjust! ### 21 | op.create_table('similarity', 22 | sa.Column('id', sa.Integer(), nullable=False), 23 | sa.Column('inquery_id', sa.Integer(), nullable=False), 24 | sa.Column('other_inquery_id', sa.Integer(), nullable=False), 25 | sa.Column('score', sa.Float(), nullable=False), 26 | sa.ForeignKeyConstraint(['inquery_id'], ['inquery.id'], ), 27 | sa.PrimaryKeyConstraint('id') 28 | ) 29 | # ### end Alembic commands ### 30 | 31 | 32 | def downgrade(): 33 | # ### commands auto generated by Alembic - please adjust! ### 34 | op.drop_table('similarity') 35 | # ### end Alembic commands ### 36 | -------------------------------------------------------------------------------- /annotation-program/templates/user_stats.html: -------------------------------------------------------------------------------- 1 | {% extends 'base.html' %} 2 | 3 | {% block content %} 4 |
5 |

Annotated Data

6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | {% for annotation in annotations_data %} 17 | 18 | 19 | 20 | 25 | 26 | 27 | {% endfor %} 28 | 29 |
# | Query | Candidates (Relevance) | Timestamp
{{ loop.index }}{{ annotation.query }} 21 | {% for candidate, relevance in annotation.candidates %} 22 | {{ candidate }} ({{ relevance }}){% if not loop.last %}, {% endif %} 23 | {% endfor %} 24 | {{ annotation.timestamp }}
30 |
31 | {% endblock %} 32 | -------------------------------------------------------------------------------- /annotation-program/templates/tag.html: -------------------------------------------------------------------------------- 1 | {% extends 'base.html' %} 2 | 3 | {% block content %} 4 |
5 |

Tag: {{ tag.name }}

6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | {% for candidate in candidates %} 17 | 18 | 19 | 20 | 21 | 26 | 27 | {% endfor %} 28 | 29 |
Query Text | Description | Source | Tags
{{ candidate.text }}{{ candidate.description }}source 22 | {% for tag in candidate.tags %} 23 | {{ tag.name }}{% if not loop.last %}, {% endif %} 24 | {% endfor %} 25 |
30 |
31 | {% endblock %} 32 | -------------------------------------------------------------------------------- /annotation-program/templates/home.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block title %} 4 | Welcome 5 | {% endblock %} 6 | 7 | {% block content %} 8 |
9 |
10 |
11 |

Privacy Requirements Collection

12 |

Welcome to the comprehensive collection of privacy requirements and design patterns

13 |
14 |
15 | 16 | {% for annotation in annotations_data %} 17 |
18 |
19 |

{{ annotation.source }}

20 | Start Annotation for {{ annotation.source }} 21 |
22 |
23 |
24 | {% set progress = annotation.done / annotation.total * 100 %} 25 |
29 | {{ progress|round(2) }}% 30 |
31 |
32 |

You've completed {{ annotation.done }} out of {{ annotation.total }} annotations for source "{{ annotation.source }}".

33 |
34 |
35 | {% endfor %} 36 |
37 | {% endblock %} 38 | -------------------------------------------------------------------------------- /annotation-program/templates/base.html: -------------------------------------------------------------------------------- 1 | {% extends "bootstrap/base.html" %} 2 | 3 | 4 | {% block navbar %} 5 | 38 | 39 | {% with messages = get_flashed_messages(with_categories=true) %} 40 | {% if messages %} 41 |
42 | {% for category, message in messages %} 43 |
44 | {{ message }} 45 |
46 | {% endfor %} 47 |
48 | {% endif %} 49 | {% endwith %} 50 | 51 | {% endblock %} 52 | 53 | -------------------------------------------------------------------------------- /annotation-program/templates/register.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block title %}Welcome{% endblock %} 4 | 5 | {% block content %} 6 | 7 |
8 |

Register

9 | {% for message in get_flashed_messages() %} 10 |
{{ message }}
11 | {% endfor %} 12 |
13 | 14 |
15 | 16 | 17 |
18 |
19 | 20 | 21 |
22 |
23 | 24 | 25 |
26 |
27 | 28 | 29 |
30 | 31 |
32 | 33 | 34 |
35 | 36 |
37 |
38 | Already have an account? Login 39 |
40 | 41 | {% endblock %} 42 | 43 | -------------------------------------------------------------------------------- /letor/feature_importance.py: -------------------------------------------------------------------------------- 1 | import lightgbm as lgb 2 | import numpy as np 3 | import pandas as pd 4 | 5 | base_path="train/" 6 | 7 | # Function to read and extract feature indices from LightSVM formatted data 8 | def get_feature_indices(files): 9 | all_features = set() 10 | for file_name in files: 11 | with open(base_path + file_name, 'r') as file: 12 | for line in file: 13 | # Remove the comment part of the line if it exists 14 | line = line.split('#')[0].strip() 15 | # Skip the label and qid, then extract feature indices 16 | tokens = line.strip().split()[2:] # Skip the label and qid 17 | features = {int(tok.split(':')[0]) for tok in tokens if ':' in tok} 18 | all_features.update(features) 19 | return all_features 20 | 21 | # Paths to your testing data files 22 | train_files = [ 23 | 'test_fold_1.txt' 24 | ] 25 | 26 | # Get all feature indices 27 | all_feature_indices = get_feature_indices(train_files) 28 | num_total_features = max(all_feature_indices) 29 | 30 | # Load your models and calculate feature importances 31 | model_files = [ 32 | 'model_fold_1.txt', 33 | 'model_fold_2.txt', 34 | 'model_fold_3.txt', 35 | 'model_fold_4.txt', 36 | 'model_fold_5.txt' 37 | ] 38 | 39 | # Initialize a dictionary to store the feature importances from all folds 40 | feature_importances = {f'f{i}': [] for i in range(1, num_total_features + 1)} 41 | 42 | # Load each model and gather the feature importances 43 | for model_file in model_files: 44 | bst = lgb.Booster(model_file=base_path + model_file) # Load the model 45 | fold_importance = bst.feature_importance(importance_type='gain') 46 | # Store the feature importances for the fold 47 | for i, importance in enumerate(fold_importance, start=1): 48 | feature_importances[f'f{i}'].append(importance) 49 | 50 | # Calculate the average importance for each feature 51 | average_importances = {feature: np.mean(importances) for feature, importances in feature_importances.items()} 52 | 53 | # Convert to a DataFrame for easier manipulation and saving to Excel 54 | importance_df = pd.DataFrame.from_dict(average_importances, orient='index', columns=['Average Importance']) 55 | importance_df.index.name = 'Feature' 56 | 57 | # Sort the DataFrame by the feature importances 58 | importance_df = importance_df.sort_values(by='Average Importance', ascending=False) 59 | 60 | # Save to an Excel file 61 | importance_df.to_excel(base_path + 'feature_importances.xlsx') 62 | 63 | print("Feature importances have been calculated and saved to feature_importances.xlsx") 64 | -------------------------------------------------------------------------------- /recommender/predict_console.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import lightgbm as lgb 4 | from feature_creation import PrivacyPatternFeatures 5 | 6 | pp = PrivacyPatternFeatures() 7 | 8 | with open("data/patterns.json", 'r') as p: 9 | patterns = json.loads(p.read()) 10 | 11 | pattern_name = [pattern["title"].replace(".md", "") for i, pattern in enumerate(patterns)] 12 | 13 | with open("data/patterns_new.json", 'r') as p: 14 | patterns_new = json.loads(p.read()) 15 | 16 | for p_new in patterns_new: 17 | patterns.append(p_new) 18 | pattern_name.append(p_new["title"]) 19 | 20 | def process_new_requirement(new_req_text): 21 | """ 22 | Process a new 
privacy requirement text to create a feature vector. 23 | 24 | Parameters: 25 | - new_req_text (str): The new privacy requirement text. 26 | - patterns (list): The list of patterns (already loaded from the patterns file). 27 | - pp (PrivacyPatternFeatures): The PrivacyPatternFeatures instance. 28 | - pattern_name (list): List of pattern names extracted from the patterns file. 29 | 30 | Returns: 31 | - A list of feature vectors for the new requirement text. 32 | """ 33 | features = pp.construct_features(new_req_text) 34 | feature_vectors = [] 35 | 36 | for idx, pattern in enumerate(patterns): 37 | feature_vector = features[idx] # Assuming features[idx] is already a list of features 38 | feature_vectors.append(feature_vector) 39 | 40 | return feature_vectors 41 | 42 | 43 | def predict_new_data(model, new_req_text): 44 | """ 45 | Predict the ranking for a new privacy requirement text using the trained model. 46 | 47 | Parameters: 48 | - model: The trained LightGBM model. 49 | - new_req_text (str): The new privacy requirement text. 50 | 51 | Returns: 52 | - The predictions and the sorted pattern names based on their rankings. 53 | """ 54 | 55 | # Process the new requirement text 56 | feature_vectors = process_new_requirement(new_req_text) 57 | data_matrix = np.array(feature_vectors) 58 | 59 | # Make predictions using the model 60 | predictions = model.predict(data_matrix) 61 | 62 | # Rank the pattern names based on predictions 63 | sorted_indices = np.argsort(predictions)[::-1] 64 | sorted_patterns = [pattern_name[i] for i in sorted_indices] 65 | 66 | return predictions, sorted_patterns 67 | 68 | # Example usage: 69 | new_req_text = """ 70 | 71 | 72 | If I have written a privacy tool as a .NET web application which is to be hosted on a commercial hosting site, other than hosting it in a privacy friendly country, how can I assure users that the application has not been compromised by a third party at the host? 73 | 74 | Obviously SSL will be used and the assemblies will be as obfuscated as possible, but these can only go so far. 75 | 76 | For example, is there a way I can ensure that my assemblies haven't been wrapped to intercept plain-text user details? 77 | 78 | """ 79 | model_file_path = "LTR_resources/model_fold_4_train_3_law.txt" # Provide the correct path to your trained model 80 | 81 | # Load the trained model 82 | bst = lgb.Booster(model_file=model_file_path) 83 | 84 | # Assuming 'pp' (PrivacyPatternFeatures instance) and 'pattern_name' list are already defined in your environment. 
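# A minimal illustrative helper (not part of the original script; the name `top_k_patterns` is hypothetical):
# given the (predictions, sorted_patterns) pair produced by the call below, it trims the full ranking to the
# k highest-scoring patterns together with their raw LightGBM scores. It relies only on numpy, imported above.
def top_k_patterns(predictions, names, k=5):
    """Return the k highest-scoring (pattern name, score) pairs."""
    order = np.argsort(predictions)[::-1][:k]
    return [(names[i], float(predictions[i])) for i in order]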
85 | predictions, sorted_patterns = predict_new_data(bst, new_req_text) 86 | 87 | # Print or process the predictions and sorted pattern names as needed 88 | print("Predictions:", predictions) 89 | print("Ranked Patterns:", sorted_patterns) 90 | -------------------------------------------------------------------------------- /letor/lightgbm_ltr_train.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import lightgbm as lgb 3 | import pandas as pd 4 | 5 | def parse_data(data): 6 | labels = [] 7 | features = [] 8 | qids = [] 9 | for line in data.strip().split('\n'): 10 | tokens = line.split() 11 | labels.append(float(tokens[0])) 12 | qids.append(int(tokens[1].split(':')[1])) 13 | feat_vals = {} 14 | for tok in tokens[2:]: 15 | if ':' in tok: 16 | feat, val = tok.split(':') 17 | feat_vals[int(feat)] = float(val) 18 | features.append(feat_vals) 19 | return labels, features, qids 20 | 21 | def create_dataset(features, labels, qids): 22 | num_features = max([max(feat_vals.keys()) for feat_vals in features]) 23 | data = np.array([[feat_vals.get(feat, 0) for feat in range(1, num_features+1)] for feat_vals in features]) 24 | group = np.unique(qids, return_counts=True)[1] 25 | return lgb.Dataset(data=data, label=labels, group=group, free_raw_data=False) 26 | 27 | # Train and evaluate model 28 | def train_evaluate(train_file, test_file): 29 | # Parse training data 30 | with open(train_file, "r") as file: 31 | train_labels, train_features, train_qids = parse_data(file.read()) 32 | 33 | # Parse test data 34 | with open(test_file, "r") as file: 35 | test_labels, test_features, test_qids = parse_data(file.read()) 36 | 37 | # Create datasets 38 | train_dataset = create_dataset(train_features, train_labels, train_qids) 39 | test_dataset = create_dataset(test_features, test_labels, test_qids) 40 | test_dataset.reference = train_dataset 41 | 42 | # Parameters 43 | params = { 44 | 'objective': 'lambdarank', 45 | 'metric': 'ndcg', 46 | 'ndcg_eval_at': list(range(1, 11)), 47 | 'learning_rate': 0.1, 48 | 'num_leaves': 31 49 | } 50 | num_round = 1000 51 | 52 | # Train model 53 | bst = lgb.train(params, train_dataset, num_round, valid_sets=[test_dataset], valid_names=['test']) 54 | 55 | # Make predictions 56 | max_feature_idx = max([max(feats.keys()) for feats in train_features]) 57 | test_data_matrix = np.array([[feat_vals.get(feat, 0) for feat in range(1, max_feature_idx+1)] for feat_vals in test_features]) 58 | test_preds = bst.predict(test_data_matrix) 59 | 60 | # Results 61 | results = {f"ndcg@{i+1}": bst.best_score['test'][f'ndcg@{i+1}'] for i in range(10)} 62 | 63 | return results, bst 64 | 65 | def generate_file_paths(fold_num, base_path="train_3"): 66 | """Generate file paths for training, testing, and model based on fold number.""" 67 | train_file = f"{base_path}/train_fold_{fold_num}.txt" 68 | test_file = f"{base_path}/test_fold_{fold_num}.txt" 69 | model_file = f"{base_path}/model_fold_{fold_num}.txt" 70 | return train_file, test_file, model_file 71 | 72 | def perform_5_fold_train_test(): 73 | """Perform 5-fold training and testing, saving results and models.""" 74 | results_list = [] 75 | for i in range(1, 6): 76 | train_file, test_file, model_file = generate_file_paths(i) 77 | 78 | results, bst = train_evaluate(train_file, test_file) 79 | results_list.append(results) 80 | bst.save_model(model_file) 81 | 82 | return results_list 83 | 84 | # Execute 5-fold training/testing and save results 85 | results_list = perform_5_fold_train_test() 86 | 
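# Illustrative addition (not in the original script): summarise the five folds by averaging each NDCG@k
# across results_list before the per-fold table is written to Excel below.
mean_ndcg = {metric: float(np.mean([fold[metric] for fold in results_list])) for metric in results_list[0]}
print("Mean NDCG across folds:", mean_ndcg)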
87 | # Store results in Excel 88 | df = pd.DataFrame(results_list) 89 | results_path = "train_results.xlsx" 90 | df.to_excel(results_path, index=False) 91 | -------------------------------------------------------------------------------- /annotation-program/migrations/env.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from logging.config import fileConfig 3 | 4 | from flask import current_app 5 | 6 | from alembic import context 7 | 8 | # this is the Alembic Config object, which provides 9 | # access to the values within the .ini file in use. 10 | config = context.config 11 | 12 | # Interpret the config file for Python logging. 13 | # This line sets up loggers basically. 14 | fileConfig(config.config_file_name) 15 | logger = logging.getLogger('alembic.env') 16 | 17 | 18 | def get_engine(): 19 | try: 20 | # this works with Flask-SQLAlchemy<3 and Alchemical 21 | return current_app.extensions['migrate'].db.get_engine() 22 | except (TypeError, AttributeError): 23 | # this works with Flask-SQLAlchemy>=3 24 | return current_app.extensions['migrate'].db.engine 25 | 26 | 27 | def get_engine_url(): 28 | try: 29 | return get_engine().url.render_as_string(hide_password=False).replace( 30 | '%', '%%') 31 | except AttributeError: 32 | return str(get_engine().url).replace('%', '%%') 33 | 34 | 35 | # add your model's MetaData object here 36 | # for 'autogenerate' support 37 | # from myapp import mymodel 38 | # target_metadata = mymodel.Base.metadata 39 | config.set_main_option('sqlalchemy.url', get_engine_url()) 40 | target_db = current_app.extensions['migrate'].db 41 | 42 | # other values from the config, defined by the needs of env.py, 43 | # can be acquired: 44 | # my_important_option = config.get_main_option("my_important_option") 45 | # ... etc. 46 | 47 | 48 | def get_metadata(): 49 | if hasattr(target_db, 'metadatas'): 50 | return target_db.metadatas[None] 51 | return target_db.metadata 52 | 53 | 54 | def run_migrations_offline(): 55 | """Run migrations in 'offline' mode. 56 | 57 | This configures the context with just a URL 58 | and not an Engine, though an Engine is acceptable 59 | here as well. By skipping the Engine creation 60 | we don't even need a DBAPI to be available. 61 | 62 | Calls to context.execute() here emit the given string to the 63 | script output. 64 | 65 | """ 66 | url = config.get_main_option("sqlalchemy.url") 67 | context.configure( 68 | url=url, target_metadata=get_metadata(), literal_binds=True 69 | ) 70 | 71 | with context.begin_transaction(): 72 | context.run_migrations() 73 | 74 | 75 | def run_migrations_online(): 76 | """Run migrations in 'online' mode. 77 | 78 | In this scenario we need to create an Engine 79 | and associate a connection with the context. 
80 | 81 | """ 82 | 83 | # this callback is used to prevent an auto-migration from being generated 84 | # when there are no changes to the schema 85 | # reference: http://alembic.zzzcomputing.com/en/latest/cookbook.html 86 | def process_revision_directives(context, revision, directives): 87 | if getattr(config.cmd_opts, 'autogenerate', False): 88 | script = directives[0] 89 | if script.upgrade_ops.is_empty(): 90 | directives[:] = [] 91 | logger.info('No changes in schema detected.') 92 | 93 | conf_args = current_app.extensions['migrate'].configure_args 94 | if conf_args.get("process_revision_directives") is None: 95 | conf_args["process_revision_directives"] = process_revision_directives 96 | 97 | connectable = get_engine() 98 | 99 | with connectable.connect() as connection: 100 | context.configure( 101 | connection=connection, 102 | target_metadata=get_metadata(), 103 | **conf_args 104 | ) 105 | 106 | with context.begin_transaction(): 107 | context.run_migrations() 108 | 109 | 110 | if context.is_offline_mode(): 111 | run_migrations_offline() 112 | else: 113 | run_migrations_online() 114 | -------------------------------------------------------------------------------- /recommender/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Privacy Requirement Predictor 7 | 8 | 26 | 27 | 28 | 29 |
44 | 45 | 46 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | -------------------------------------------------------------------------------- /recommender/data/patterns_new.json: -------------------------------------------------------------------------------- 1 | [{ 2 | "title": "Federated Learning", 3 | "excerpt": "Machine learning technique that trains an algorithm across multiple decentralized edge devices or servers holding local data samples, without exchanging them.", 4 | "description": "Federated learning (also known as collaborative learning) is a machine learning technique that trains an algorithm across multiple decentralized edge devices or servers holding local data samples, without exchanging them. This approach stands in contrast to traditional centralized machine learning techniques where all the local datasets are uploaded to one server, as well as to more classical decentralized approaches which often assume that local data samples are identically distributed. Federated learning enables multiple actors to build a common, robust machine learning model without sharing data, thus allowing to address critical issues such as data privacy, data security, data access rights and access to heterogeneous data. Its applications are spread over a number of industries including defense, telecommunications, IoT, and pharmaceutics. A major open question at the moment is how inferior models learned through federated data are relative to ones where the data are pooled. Another open question concerns the trustworthiness of the edge devices and the impact of malicious actors on the learned model. Federated learning aims at training a machine learning algorithm, for instance deep neural networks, on multiple local datasets contained in local nodes without explicitly exchanging data samples. The general principle consists in training local models on local data samples and exchanging parameters (e.g. the weights and biases of a deep neural network) between these local nodes at some frequency to generate a global model shared by all nodes. The main difference between federated learning and distributed learning lies in the assumptions made on the properties of the local datasets,[1] as distributed learning originally aims at parallelizing computing power where federated learning originally aims at training on heterogeneous datasets. While distributed learning also aims at training a single model on multiple servers, a common underlying assumption is that the local datasets are independent and identically distributed (i.i.d.) and roughly have the same size. None of these hypotheses are made for federated learning; instead, the datasets are typically heterogeneous and their sizes may span several orders of magnitude. Moreover, the clients involved in federated learning may be unreliable as they are subject to more failures or drop out since they commonly rely on less powerful communication media (i.e. Wi-Fi) and battery-powered systems (i.e. smartphones and IoT devices) compared to distributed learning where nodes are typically datacenters that have powerful computational capabilities and are connected to one another with fast networks. In the centralized federated learning setting, a central server is used to orchestrate the different steps of the algorithms and coordinate all the participating nodes during the learning process. The server is responsible for the nodes selection at the beginning of the training process and for the aggregation of the received model updates. 
Since all the selected nodes have to send updates to a single entity, the server may become a bottleneck of the system. In the decentralized federated learning setting, the nodes are able to coordinate themselves to obtain the global model. This setup prevents single point failures as the model updates are exchanged only between interconnected nodes without the orchestration of the central server. Nevertheless, the specific network topology may affect the performances of the learning process.[2] See blockchain-based federated learning[3] and the references therein. An increasing number of application domains involve a large set of heterogeneous clients, e.g., mobile phones and IoT devices.[4] Most of the existing Federated learning strategies assume that local models share the same global model architecture. Recently, a new federated learning framework named HeteroFL was developed to address heterogeneous clients equipped with very different computation and communication capabilities.[5] The HeteroFL technique can enable the training of heterogeneous local models with dynamically varying computation and non-iid data complexities while still producing a single accurate global inference model.", 5 | "source": "https://en.wikipedia.org/wiki/Federated_learning" 6 | }, 7 | { 8 | "title": "FIDO Authentication", 9 | "excerpt": "The FIDO Alliance is involved in three areas to work towards achieving its mission to reduce the world’s reliance on passwords to better secure the web: user authentication; identity verification and binding; and the Internet of Things (IoT)", 10 | "description": "The FIDO Alliance is involved in three areas to work towards achieving its mission to reduce the world’s reliance on passwords to better secure the web: user authentication; identity verification and binding; and the Internet of Things (IoT). The work areas address essential aspects of the digital identity lifecycle management including identity verification for initial account onboarding and account recovery, and user and device authentication. Passwords endure despite the growing consensus their use needs to be reduced, if not replaced. But even though effective PKI and strong authentication solutions have existed for years, barriers to widespread adoption persist. Consumers don’t like the user experience, and online service providers don’t want the cost and complexity of developing and provisioning their own dedicated solutions. The industry’s answer to the password problem. The FIDO Alliance developed FIDO Authentication standards based on public key cryptography for authentication that is more secure than passwords and SMS OTPs, simpler for consumers to use, and easier for service providers to deploy and manage. 
FIDO Authentication enables password-only logins to be replaced with secure and fast login experiences across websites and apps.", 11 | "source": "https://fidoalliance.org/fido-authentication/" 12 | } 13 | ] -------------------------------------------------------------------------------- /letor/compute_shap.py: -------------------------------------------------------------------------------- 1 | import shap 2 | import lightgbm as lgb 3 | import pandas as pd 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | 7 | # Assuming 'model_fold_1.txt' is your model file and 'test_fold_1.txt' is your test data file 8 | PARENT_PATH = "resultfold/train_1/" 9 | model_path = PARENT_PATH + 'model_fold_4.txt' 10 | test_path = PARENT_PATH + 'test_fold_4.txt' 11 | 12 | 13 | def define_feature_names(): 14 | # Generate feature names for the first 4 features 15 | feature_names = [ 16 | "Covered Words", "Covered Words Ratio", 17 | "Length of Query", "IDF of Query" 18 | ] 19 | 20 | # Generate feature names for TF features (5-14) 21 | tf_feature_names = [ 22 | "TF Sum", "TF Min", "TF Max", "TF Average", "TF Variance", 23 | "Normalized TF Sum", "Normalized TF Min", "Normalized TF Max", "Normalized TF Average", "Normalized TF Variance" 24 | ] 25 | feature_names.extend(tf_feature_names) 26 | 27 | # Generate feature names for TF-IDF features (15-19) 28 | tf_idf_feature_names = [ 29 | "TF-IDF Sum", "TF-IDF Min", "TF-IDF Max", "TF-IDF Average", "TF-IDF Variance" 30 | ] 31 | feature_names.extend(tf_idf_feature_names) 32 | 33 | feature_names.extend([ 34 | "BM25", "Content Similarity #1", "Title Similarity #1", "Excerpt Similarity #1", 35 | "Content Similarity #2", "Title Similarity #2", "Excerpt Similarity #2", 36 | "Binary Query", "Multi Query", "Binary Pattern", "Multi Pattern", 37 | "Content Similarity #1.1", "Title Similarity #1.1", "Excerpt Similarity #1.1", 38 | "Content Similarity #2.1", "Title Similarity #2.1", "Excerpt Similarity #2.1" 39 | ]) 40 | 41 | # return here for train_1 42 | # return feature_names 43 | 44 | # Define the naming for hadamard product features 45 | hadamard_feature_sets = [ 46 | "Hadamard Content #1", "Hadamard Title #1", "Hadamard Excerpt #1", 47 | "Hadamard Content #2", "Hadamard Title #2", "Hadamard Excerpt #2" 48 | ] 49 | 50 | 51 | # Generate feature names for the hadamard product features 52 | for feature_set_name in hadamard_feature_sets: 53 | for i in range(1, 769): # Each set has 768 features 54 | feature_names.append(f"{feature_set_name} {i}") 55 | 56 | # return here for train_2 57 | # return feature_names 58 | 59 | # Define the naming for concatenation features 60 | concat_feature_sets = [ 61 | "Concat Content #1", "Concat Title #1", "Concat Excerpt #1", 62 | "Concat Content #2", "Concat Title #2", "Concat Excerpt #2" 63 | ] 64 | 65 | # Generate feature names for the concatenation features 66 | for feature_set_name in concat_feature_sets: 67 | for i in range(1, 769): # Each set has 768 features 68 | feature_names.append(f"{feature_set_name} {i}") 69 | 70 | # return here for train_3 71 | return feature_names 72 | 73 | 74 | feature_names = define_feature_names() 75 | 76 | # Define a function to parse the data, as it's used for both training and test data 77 | def parse_data(data): 78 | labels = [] 79 | features = [] 80 | qids = [] 81 | for line in data.strip().split('\n'): 82 | tokens = line.split() 83 | labels.append(float(tokens[0])) 84 | qids.append(int(tokens[1].split(':')[1])) 85 | feat_vals = {} 86 | for tok in tokens[2:]: 87 | if ':' in tok: 88 | feat, val = 
tok.split(':') 89 | feat_vals[int(feat)] = float(val) 90 | features.append(feat_vals) 91 | return labels, features, qids 92 | 93 | 94 | # Load the trained model 95 | bst = lgb.Booster(model_file=model_path) 96 | 97 | # Manually set the objective parameter if it's not present 98 | if 'objective' not in bst.params: 99 | bst.params['objective'] = 'lambdarank' 100 | 101 | # Parse test data 102 | with open(test_path, "r") as file: 103 | test_labels, test_features, test_qids = parse_data(file.read()) 104 | 105 | # Prepare test data matrix 106 | max_feature_idx = max([max(feats.keys()) for feats in test_features]) 107 | test_data_matrix = np.array([[feat_vals.get(feat, 0) for feat in range(1, max_feature_idx+1)] for feat_vals in test_features]) 108 | 109 | # Create SHAP explainer 110 | explainer = shap.TreeExplainer(bst) 111 | 112 | # Compute SHAP values 113 | shap_values = explainer.shap_values(test_data_matrix) 114 | 115 | shap_explanation = shap.Explanation(values=shap_values[0], 116 | base_values=explainer.expected_value, 117 | data=test_data_matrix[0], feature_names=feature_names) 118 | 119 | # Save the SHAP summary bar plot with the top 10 features to a file 120 | shap.summary_plot(shap_values, test_data_matrix, plot_type='bar', feature_names=feature_names, show=False, max_display=10) 121 | plt.savefig('ltr_shap_summary_bar_top10_named.png') 122 | plt.close() 123 | 124 | # Save the SHAP beeswarm plot with the top 10 features to a file 125 | shap.summary_plot(shap_values, test_data_matrix, show=False, feature_names=feature_names, max_display=10) 126 | plt.savefig('ltr_shap_summary_beeswarm_top10_named.png') 127 | plt.close() 128 | 129 | 130 | # Save the SHAP dependence plot for a specific feature (e.g., Feature 1) to a file 131 | shap.dependence_plot(21, shap_values, test_data_matrix, feature_names=feature_names, show=False) 132 | plt.savefig('ltr_shap_dependence_plot_feature_22.png') 133 | plt.close() 134 | 135 | # Save the SHAP dependence plot for a specific feature (e.g., Feature 1) to a file 136 | # shap.dependence_plot(597, shap_values, test_data_matrix, feature_names=feature_names, show=False) 137 | # plt.savefig('ltr_shap_dependence_plot_feature_597.png') 138 | # plt.close() 139 | 140 | # Save the SHAP waterfall plot for the first prediction to a file 141 | shap_waterfall_plot = plt.figure() 142 | shap.plots.waterfall(shap_explanation, max_display=10) 143 | shap_waterfall_plot.savefig('ltr_shap_waterfall_plot_named.png') 144 | plt.close(shap_waterfall_plot) 145 | 146 | 147 | -------------------------------------------------------------------------------- /letor/case_studies.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import numpy as np 3 | from rich.console import Console 4 | from rich.table import Table 5 | 6 | def dcg_at_k(r: List[int], k: int) -> float: 7 | """Discounted Cumulative Gain at rank k.""" 8 | r = np.asfarray(r)[:k] 9 | return np.sum(r / np.log2(np.arange(2, r.size + 2))) 10 | 11 | def ndcg_at_k(r: List[int], k: int) -> float: 12 | """Normalized Discounted Cumulative Gain at rank k.""" 13 | dcg_max = dcg_at_k(sorted(r, reverse=True), k) 14 | if not dcg_max: 15 | return 0. 
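    # The return below computes DCG(r, k) / DCG(sorted r, k). Worked example (illustrative, not part of the
    # original code): for r = [3, 2, 0] and k = 3, the DCG is 3/log2(2) + 2/log2(3) + 0/log2(4) ≈ 4.262 and the
    # ideal ordering gives the same value, so NDCG@3 = 1.0; for r = [0, 2, 3] the DCG is ≈ 2.762, so NDCG@3 ≈ 0.648.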
16 | return dcg_at_k(r, k) / dcg_max 17 | 18 | def calculate_metrics(recommendations: List[List[str]], ideal: List[List[str]]) -> dict: 19 | max_k = 5 # Maximum rank for NDCG calculations 20 | ndcg_scores = {k: [] for k in range(1, max_k + 1)} 21 | ndcg_details = [] 22 | 23 | for rec, ideal_rec in zip(recommendations, ideal): 24 | # Convert to graded relevance 25 | relevance = [] 26 | for i in range(len(rec)): 27 | if rec[i] in ideal_rec: 28 | pos_diff = abs(ideal_rec.index(rec[i]) - i) 29 | relevance_score = max(5 - pos_diff, 1) 30 | else: 31 | relevance_score = 0 32 | relevance.append(relevance_score) 33 | 34 | row_ndcg = {} 35 | for k in range(1, max_k + 1): 36 | ndcg_score = ndcg_at_k(relevance, k) 37 | ndcg_scores[k].append(ndcg_score) 38 | row_ndcg[f'NDCG@{k}'] = ndcg_score 39 | ndcg_details.append(row_ndcg) 40 | 41 | metrics = { 42 | "mean_ndcg": {k: np.mean(ndcg_scores[k]) for k in ndcg_scores}, 43 | "ndcg_details": ndcg_details 44 | } 45 | 46 | return metrics 47 | 48 | def find_best_ndcg_rows(ndcg_details): 49 | best_rows = {} 50 | for k in range(1, 6): # For each NDCG rank from 1 to 5 51 | best_score = -1 52 | best_row_index = -1 53 | for i, row in enumerate(ndcg_details): 54 | if row[f'NDCG@{k}'] > best_score: 55 | best_score = row[f'NDCG@{k}'] 56 | best_row_index = i 57 | best_rows[f'best_row_for_ndcg@{k}'] = (best_row_index, best_score) 58 | return best_rows 59 | 60 | def sort_ndcg_rows(ndcg_details): 61 | sorted_rows = {} 62 | for k in range(1, 6): # For each NDCG rank from 1 to 5 63 | rows_with_scores = [(i, row[f'NDCG@{k}']) for i, row in enumerate(ndcg_details)] 64 | rows_with_scores.sort(key=lambda x: x[1], reverse=True) # Sort by NDCG score, highest first 65 | sorted_rows[f'ndcg@{k}_sorted'] = rows_with_scores 66 | return sorted_rows 67 | 68 | def display_sorted_ndcg_rows(sorted_ndcg_rows): 69 | console = Console() 70 | for k in sorted_ndcg_rows: 71 | table = Table(show_header=True, header_style="bold magenta") 72 | table.add_column("Row Index", style="dim") 73 | table.add_column(f"NDCG@{k.split('@')[1].split('_')[0]}", justify="right") # keys look like 'ndcg@3_sorted'; k[-1] would render the header as 'NDCG@d' 74 | 75 | for row_index, score in sorted_ndcg_rows[k]: 76 | table.add_row(str(row_index), f"{score:.4f}") 77 | 78 | console.print(f"Sorted Rows for {k.upper()}") 79 | console.print(table) 80 | 81 | 82 | def calculate_average_ndcg_per_row(ndcg_details): 83 | average_ndcg_scores = [] 84 | for row in ndcg_details: 85 | average_score = sum(row[f'NDCG@{k}'] for k in range(1, 6)) / 5 86 | average_ndcg_scores.append(average_score) 87 | return average_ndcg_scores 88 | 89 | def display_sorted_average_ndcg_rows(average_ndcg_scores): 90 | sorted_average_scores = sorted(enumerate(average_ndcg_scores), key=lambda x: x[1], reverse=True) 91 | 92 | console = Console() 93 | table = Table(show_header=True, header_style="bold magenta") 94 | table.add_column("Row Index", style="dim") 95 | table.add_column("Average NDCG Score", justify="right") 96 | 97 | for row_index, score in sorted_average_scores: 98 | table.add_row(str(row_index), f"{score:.4f}") 99 | 100 | console.print("Rows Sorted by Average NDCG Score (Best to Worst)") 101 | console.print(table) 102 | 103 | recommendations = [ 104 | ["Attribute Based Credentials", "Psuedonymous Identity", "Location Granularity", 105 | "Decoupling content and location information visibility", "Onion Routing"], 106 | ["Psuedonymous Identity", "Attribute Based Credentials", "Obtaining Explicit Consent", 107 | "Onion Routing", "Protection against Tracking"], 108 | ["Support Selective Disclosure", "Pseudonymous Messaging", "Psuedonymous
Identity", 109 | "Attribute Based Credentials", "Added-noise measurement obfuscation"], 110 | ["Attribute Based Credentials", "Active broadcast of presence", "Added-noise measurement obfuscation", 111 | "Awareness Feed", "Personal Data Store"], 112 | ["Strip Invisible Metadata", "Added-noise measurement obfuscation", "Lawful Consent", 113 | "Obtaining Explicit Consent", "Attribute Based Credentials"], 114 | ["Active broadcast of presence", "Privacy dashboard", "Attribute Based Credentials", 115 | "Abridged Terms and Conditions", "Dynamic Privacy Policy Display"], 116 | ["Lawful Consent", "Attribute Based Credentials", "Awareness Feed", 117 | "Selective access control", "Informed Implicit Consent"] 118 | ] 119 | 120 | recommendations_ideal = [ 121 | ["Location Granularity", "Decoupling content and location information visibility", "Psuedonymous Identity", "Attribute Based Credentials", "O"], 122 | ["Protection against Tracking", "Psuedonymous Identity", "Attribute Based Credentials", "Onion Routing", "O"], 123 | ["Support Selective Disclosure", "Pseudonymous Messaging", "Psuedonymous Identity", 124 | "Attribute Based Credentials", "Added-noise measurement obfuscation"], 125 | ["Attribute Based Credentials", "Added-noise measurement obfuscation", "Active broadcast of presence", "Awareness Feed", "Personal Data Store"], 126 | ["Strip Invisible Metadata", "Added-noise measurement obfuscation", "O", 127 | "O", "Attribute Based Credentials"], 128 | ["Active broadcast of presence", "Abridged Terms and Conditions", "Privacy dashboard", "Dynamic Privacy Policy Display", "O"], 129 | ["Lawful Consent", "Awareness Feed", 130 | "Informed Implicit Consent", "Selective access control", "O"] 131 | ] 132 | 133 | # Example usage 134 | metrics = calculate_metrics(recommendations, recommendations_ideal) 135 | sorted_ndcg_rows = sort_ndcg_rows(metrics['ndcg_details']) 136 | best_ndcg_rows = find_best_ndcg_rows(metrics['ndcg_details']) 137 | 138 | print("Metrics:", metrics) 139 | print("==="*5) 140 | print("Best NDCG rows:", best_ndcg_rows) 141 | print("==="*5) 142 | display_sorted_ndcg_rows(sorted_ndcg_rows) 143 | 144 | average_ndcg_scores = calculate_average_ndcg_per_row(metrics['ndcg_details']) 145 | 146 | # Print using rich console 147 | display_sorted_average_ndcg_rows(average_ndcg_scores) -------------------------------------------------------------------------------- /recommender/app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, render_template, request, jsonify, url_for 2 | import json 3 | import numpy as np 4 | import lightgbm as lgb 5 | from feature_creation import PrivacyPatternFeatures 6 | import os 7 | import shap 8 | import matplotlib.pyplot as plt 9 | 10 | app = Flask(__name__) 11 | pp = PrivacyPatternFeatures() 12 | 13 | # Load the patterns and the trained model 14 | with open("data/patterns.json", 'r') as p: 15 | patterns = json.load(p) 16 | 17 | pattern_name = [pattern["title"].replace(".md", "") for pattern in patterns] 18 | 19 | # Load any additional patterns 20 | with open("data/patterns_new.json", 'r') as p: 21 | patterns_new = json.load(p) 22 | 23 | for p_new in patterns_new: 24 | patterns.append(p_new) 25 | pattern_name.append(p_new["title"]) 26 | 27 | model_file_path = "LTR_resources/model_fold_4_train_3.txt" # the path to LeToR trained model on LightGBM 28 | if os.path.exists(model_file_path): 29 | bst = lgb.Booster(model_file=model_file_path) 30 | bst.params['objective'] = 'lambdarank' 31 | else: 32 | raise 
FileNotFoundError("Model file not found.") 33 | 34 | @app.route('/') 35 | def index(): 36 | return render_template('index.html') 37 | 38 | @app.route('/predict', methods=['POST']) 39 | def predict(): 40 | new_req_text = request.form['requirement'] 41 | feature_vectors = process_new_requirement(new_req_text) 42 | data_matrix = np.array(feature_vectors) 43 | 44 | # Get sorted patterns with SHAP plots 45 | sorted_patterns_with_shap = predict_new_data(bst, data_matrix) 46 | 47 | # Get indices of sorted patterns from original pattern list 48 | sorted_pattern_indices = [pattern_name.index(pattern[0]) for pattern in sorted_patterns_with_shap] 49 | 50 | # Use these indices to get the correct excerpts and the SHAP image paths 51 | patterns_with_desc_and_shap = [ 52 | { 53 | "pattern": sorted_patterns_with_shap[i][0], 54 | "excerpt": patterns[sorted_pattern_indices[i]]["excerpt"], 55 | "shap_plot_path": sorted_patterns_with_shap[i][2], # SHAP plot image path 56 | "shap_waterfall_plot_path": sorted_patterns_with_shap[i][3] # SHAP plot image path 57 | } 58 | for i in range(len(sorted_patterns_with_shap)) 59 | ] 60 | 61 | # Return JSON response with paths to SHAP plot images 62 | return jsonify(sorted_patterns=patterns_with_desc_and_shap) 63 | 64 | def get_feature_names(): 65 | # Generate feature names for the first 4 features 66 | feature_names = [ 67 | "Covered Words", "Covered Words Ratio", 68 | "Length of Query", "IDF of Query" 69 | ] 70 | 71 | # Generate feature names for TF features (5-14) 72 | tf_feature_names = [ 73 | "TF Sum", "TF Min", "TF Max", "TF Average", "TF Variance", 74 | "Normalized TF Sum", "Normalized TF Min", "Normalized TF Max", "Normalized TF Average", "Normalized TF Variance" 75 | ] 76 | feature_names.extend(tf_feature_names) 77 | 78 | # Generate feature names for TF-IDF features (15-19) 79 | tf_idf_feature_names = [ 80 | "TF-IDF Sum", "TF-IDF Min", "TF-IDF Max", "TF-IDF Average", "TF-IDF Variance" 81 | ] 82 | feature_names.extend(tf_idf_feature_names) 83 | 84 | feature_names.extend([ 85 | "BM25", "Content Similarity #1", "Title Similarity #1", "Excerpt Similarity #1", 86 | "Content Similarity #2", "Title Similarity #2", "Excerpt Similarity #2", 87 | "Binary Query", "Multi Query", "Binary Pattern", "Multi Pattern", 88 | "Content Similarity #1.1", "Title Similarity #1.1", "Excerpt Similarity #1.1", 89 | "Content Similarity #2.1", "Title Similarity #2.1", "Excerpt Similarity #2.1" 90 | ]) 91 | 92 | # return here for train_1 93 | # return feature_names 94 | 95 | # Define the naming for hadamard product features 96 | hadamard_feature_sets = [ 97 | "Hadamard Content #1", "Hadamard Title #1", "Hadamard Excerpt #1", 98 | "Hadamard Content #2", "Hadamard Title #2", "Hadamard Excerpt #2" 99 | ] 100 | 101 | 102 | # Generate feature names for the hadamard product features 103 | for feature_set_name in hadamard_feature_sets: 104 | for i in range(1, 769): # Each set has 768 features 105 | feature_names.append(f"{feature_set_name} {i}") 106 | 107 | # return here for train_2 108 | # return feature_names 109 | 110 | # Define the naming for concatenation features 111 | concat_feature_sets = [ 112 | "Concat Content #1", "Concat Title #1", "Concat Excerpt #1", 113 | "Concat Content #2", "Concat Title #2", "Concat Excerpt #2" 114 | ] 115 | 116 | # Generate feature names for the concatenation features 117 | for feature_set_name in concat_feature_sets: 118 | for i in range(1, 769): # Each set has 768 features 119 | feature_names.append(f"{feature_set_name} {i}") 120 | 121 | # return here for train_3 
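    # Feature-count note (derived from the construction above): the base list holds 36 names
    # (4 query stats + 10 TF + 5 TF-IDF + BM25 + 12 similarity + 4 classifier features); each
    # Hadamard or concatenation set adds 6 * 768 = 4,608 names, so the train_1 / train_2 /
    # train_3 return points yield 36 / 4,644 / 9,252 feature names respectively.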
122 | return feature_names 123 | 124 | def process_new_requirement(new_req_text): 125 | """ 126 | Process a new privacy requirement text to create a feature vector. 127 | 128 | Parameters: 129 | - new_req_text (str): The new privacy requirement text. 130 | - patterns (list): The list of patterns (already loaded from the patterns file). 131 | - pp (PrivacyPatternFeatures): The PrivacyPatternFeatures instance. 132 | - pattern_name (list): List of pattern names extracted from the patterns file. 133 | 134 | Returns: 135 | - A list of feature vectors for the new requirement text. 136 | """ 137 | features = pp.construct_features(new_req_text) 138 | feature_vectors = [] 139 | 140 | for idx, pattern in enumerate(patterns): 141 | feature_vector = features[idx] # Assuming features[idx] is already a list of features 142 | feature_vectors.append(feature_vector) 143 | 144 | return feature_vectors 145 | 146 | def predict_new_data(model, feature_vectors): 147 | print(len(feature_vectors)) 148 | # Make predictions using the model 149 | predictions = model.predict(feature_vectors) 150 | 151 | 152 | # Rank the pattern names based on predictions and select only the top 7 153 | sorted_indices = np.argsort(predictions)[::-1][:7] 154 | top_sorted_patterns = [pattern_name[i] for i in sorted_indices] 155 | 156 | # Generate SHAP values 157 | explainer = shap.TreeExplainer(model) 158 | shap_values = explainer.shap_values(feature_vectors) 159 | 160 | # Generate SHAP force plot images for each prediction 161 | shap_image_paths = [] 162 | shap_waterfall_image_paths = [] 163 | for i in sorted_indices: 164 | print(shap_values[i]) 165 | print(feature_vectors[i]) 166 | 167 | # FORCE PLOT 168 | plt.figure() 169 | shap.force_plot( 170 | explainer.expected_value, shap_values[i], feature_vectors[i], 171 | feature_names=get_feature_names(), matplotlib=True, show=False 172 | ) 173 | image_url = url_for('static', filename=f'shap_plots/pattern_{i}.png') 174 | image_path = f"static/shap_plots/pattern_{i}.png" 175 | plt.savefig(image_path) 176 | shap_image_paths.append(image_url) 177 | plt.close() 178 | 179 | # WATERFALL PLOT 180 | plt.figure() 181 | plt.tight_layout() 182 | # Create an Explanation object 183 | shap_explanation = shap.Explanation( 184 | values=shap_values[i], 185 | base_values=explainer.expected_value, 186 | data=feature_vectors[i], 187 | feature_names=get_feature_names() 188 | ) 189 | # Generate a waterfall plot for the i-th prediction 190 | shap.plots.waterfall(shap_explanation, max_display=14, show=False) 191 | waterfall_image_path = f"static/shap_plots/waterfall_pattern_{i}.png" 192 | plt.savefig(waterfall_image_path, bbox_inches='tight') 193 | shap_waterfall_image_paths.append(url_for('static', filename=f"shap_plots/waterfall_pattern_{i}.png")) 194 | plt.close() 195 | 196 | # Combine predictions, pattern names, and their corresponding SHAP values 197 | top_patterns_with_shap = [ 198 | (pattern_name[i], predictions[i], shap_image_paths[j], shap_waterfall_image_paths[j]) 199 | for j, i in enumerate(sorted_indices) 200 | ] 201 | 202 | return top_patterns_with_shap 203 | 204 | if __name__ == '__main__': 205 | app.run(debug=True) 206 | -------------------------------------------------------------------------------- /annotation-program/templates/annotate.html: -------------------------------------------------------------------------------- 1 | {% extends 'base.html' %} 2 | 3 | {% block title %} 4 | Choose the Candidates 5 | {% endblock %} 6 | 7 | {% block content %} 8 | 57 |
58 |
59 |
63 | {{ progress|round(2) }}% 64 |
65 |
66 | 67 | {% if similar_inquery_data %} 68 | 71 | {% endif %} 72 | 73 | 74 | 75 |
76 |
77 | 78 | 79 |

#{{ query.id }}

80 |

{{ query.text }}

81 | tags: 82 | {% for tag in query.tags %} 83 | {{ tag.name }}{% if not loop.last %}, {% endif %} 84 | {% endfor %} 85 |
86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | {% for candidate in recommended_candidates %} 96 | 97 | 107 | 114 | 115 | {% endfor %} 116 | 117 | 118 | {% for candidate in other_candidates %} 119 | 120 | 130 | 137 | 138 | {% endfor %} 139 | 140 |
Privacy Design Pattern Relevance
98 |

{{ candidate.candidate.text }}

99 |

{{ candidate.candidate.description }}...more

100 |
101 | tags: 102 | {% for tag in candidate.candidate.tags %} 103 | {{ tag.name }}{% if not loop.last %}, {% endif %} 104 | {% endfor %} 105 | 106 |
108 |
109 | {% for i in range(5, 0, -1) %} 110 | 111 | {% endfor %} 112 |
113 |
121 |

{{ candidate.text }}

122 |

{{ candidate.description }}...more

123 |
124 | tags: 125 | {% for tag in candidate.tags %} 126 | {{ tag.name }}{% if not loop.last %}, {% endif %} 127 | {% endfor %} 128 | 129 |
131 |
132 | {% for i in range(5, 0, -1) %} 133 | 134 | {% endfor %} 135 |
136 |
141 | 142 | 143 |
144 |
145 | 146 | 147 | {% if similar_inquery_data %} 148 | 203 | 204 | 205 | {% endif %} 206 | 207 | 208 | {% endblock %} 209 | -------------------------------------------------------------------------------- /classification/sklearn_classifier.py: -------------------------------------------------------------------------------- 1 | import json, pickle 2 | 3 | from sklearn.feature_extraction.text import CountVectorizer 4 | from sklearn.naive_bayes import MultinomialNB 5 | from sklearn.model_selection import cross_val_score 6 | from sklearn.model_selection import train_test_split 7 | from sklearn.feature_extraction.text import CountVectorizer 8 | from sklearn.model_selection import StratifiedKFold 9 | 10 | import numpy as np 11 | 12 | import json 13 | 14 | from text_preprocessing import preprocess_text 15 | from text_preprocessing import to_lower, remove_stopword, lemmatize_word 16 | 17 | preprocess_functions = [to_lower, remove_stopword, lemmatize_word] 18 | 19 | privacy_objectives = { 20 | "anonymity" : ["Protection-against-tracking", "Location-granularity", "Pseudonymous-messaging", "Onion-routing", "Anonymous-reputation-based-blacklisting", "Attribute-based-credentials", "Anonymity-set"], 21 | 22 | "unlinkability" : ["Protection-against-tracking", "Location-granularity", "Pseudonymous-messaging", "Onion-routing", "Anonymous-reputation-based-blacklisting", "Attribute-based-credentials", "Decoupling-[content]-and-location-information-visibility","Active-broadcast-of-presence","Trustworthy-privacy-plugin"], 23 | 24 | "confidentiality" : ["Informed-Secure-Passwords", "Encryption-user-managed-keys", "Personal-data-store", "Aggregation-gateway", "Single-Point-of-Contact", "User-data-confinement-pattern", "Selective-Access-Control", "Buddy-List", "Added-noise-measurement-obfuscation", "Trustworthy-privacy-plugin", "Support-Selective-Disclosure", "Private-link", "Active-broadcast-of-presence", "Unusual-activities"], 25 | 26 | "plausible_deniability" : ["Location-granularity", "Use-of-dummies", "Onion-routing", "Pseudonymous-identity", "Added-noise-measurement-obfuscation", "Attribute-based-credentials", "Anonymity-set"], 27 | 28 | "undetectability" : ["Location-granularity", "Use-of-dummies", "Aggregation-gateway", "Trustworthy-privacy-plugin", "Active-broadcast-of-presence"], 29 | 30 | "manageability" : ["Federated-privacy-impact-assessment", "Data-breach-notification-pattern", "Trust-Evaluation-of-Services-Sides", "Sign-an-Agreement-to-Solve-Lack-of-Trust-on-the-Use-of-Private-Data-Context", "Obligation-management", "Privacy-Aware-Wording", "Sticky-policy"], 31 | 32 | "intervenability" : ["Minimal-Information-Asymmetry", "Informed-Secure-Passwords", "Awareness-Feed", "Encryption-user-managed-keys", "Whos-Listening", "Discouraging-blanket-strategies", "Outsourcing-[with-consent]", "Personal-data-store", "Single-Point-of-Contact", "Enable-Disable-Functions", "Obtaining-Explicit-Consent", "Decoupling-[content]-and-location-information-visibility", "Selective-Access-Control", "Informed-Credential-Selection", "Reasonable-Level-of-Control", "Masquerade", "Buddy-List", "Lawful-Consent", "Sticky-policy", "Personal-Data-Table", "Informed-Consent-for-Web-based-Transactions", "Support-Selective-Disclosure", "Private-link", "Active-broadcast-of-presence"], 33 | 34 | "transparency" : ["Minimal-Information-Asymmetry", "Informed-Secure-Passwords", "Awareness-Feed", "Whos-Listening", "Privacy-Policy-Display", "Layered-policy-design", "Asynchronous-notice", "Abridged-Terms-and-Conditions", "Policy-matching-display", 
"Ambient-notice", "Dynamic-Privacy-Policy-Display", "Privacy-Labels", "Data-breach-notification-pattern", "Trust-Evaluation-of-Services-Sides", "Appropriate-Privacy-Icons", "Privacy-aware-network-client", "Informed-Implicit-Consent", "Privacy-color-coding", "Icons-for-Privacy-Policies", "Obtaining-Explicit-Consent", "Privacy-Mirrors", "Appropriate-Privacy-Feedback", "Impactful-Information-and-Feedback", "Platform-for-Privacy-Preferences", "Privacy-dashboard", "Preventing-Mistakes-or-Reducing-Their-Impact", "Informed-Credential-Selection", "Privacy-Awareness-Panel", "Lawful-Consent", "Privacy-Aware-Wording", "Sticky-policy", "Personal-Data-Table", "Informed-Consent-for-Web-based-Transactions", "Increasing-Awareness-of-Information-Aggregation", "Unusual-activities"], 35 | } 36 | 37 | hard_goal = ["unlinkability","anonymity","pseudonym","undetectability","confidentiality","plausible_deniability"] 38 | soft_goal = ["transparency","intervenability","content_awareness"] 39 | skip_goal = ["availability", "integrity"] 40 | 41 | hard_pattern = [] 42 | soft_pattern = [] 43 | 44 | for g in hard_goal: 45 | if g not in privacy_objectives: 46 | continue 47 | 48 | for o in privacy_objectives[g]: 49 | hard_pattern.append(o) 50 | 51 | for g in soft_goal: 52 | if g not in privacy_objectives: 53 | continue 54 | 55 | for o in privacy_objectives[g]: 56 | soft_pattern.append(o) 57 | 58 | def get_data(multiclass=False): 59 | req_path = "../data/requirements.json" 60 | 61 | with open(req_path, 'r') as p: 62 | requirements = json.loads(p.read()) 63 | 64 | text, label = [], [] 65 | for r in requirements["rows"]: 66 | text.append(preprocess_text(r["req_text"], preprocess_functions)) 67 | lbl = r["req_type"].replace("_3","").replace("_2","").replace("_1","") 68 | 69 | if multiclass: 70 | label.append(lbl) 71 | else: 72 | label.append(1 if lbl in hard_goal else 0) 73 | 74 | return text, label 75 | 76 | def append_text_label(filepath): 77 | text, label, label_unique = [], [], [] 78 | 79 | # MAKE LABEL AS INDEX 80 | with open(filepath,"r",encoding='utf-8') as dd: 81 | for d in dd: 82 | if len(d)<=8: 83 | continue 84 | 85 | l = d.split()[0].replace("__label__","") 86 | 87 | if l in skip_goal: 88 | continue 89 | 90 | label_unique.append(l) 91 | 92 | label_unique = list(set(label_unique)) 93 | print(label_unique) 94 | 95 | with open(filepath,"r") as dd: 96 | for d in dd: 97 | if len(d)<=8: 98 | continue 99 | 100 | l = d.split()[0].replace("__label__","") 101 | 102 | if l in skip_goal: 103 | continue 104 | 105 | label.append(label_unique.index(l)) 106 | text.append(" ".join(d.split()[1:])) 107 | 108 | # if d[9].strip() in ['0','1']: 109 | # label.append(d[9].strip()) 110 | # text.append(d[11:].strip()) 111 | 112 | return text, label 113 | 114 | 115 | def get_data_from_file(with_aug=True, combine_test=True, binary="binary"): 116 | text, label = [], [] 117 | 118 | if with_aug: 119 | text_temp, label_temp = append_text_label("data/privacy_{}_data_train_aug.txt".format(binary)) 120 | text.extend(text_temp) 121 | label.extend(label_temp) 122 | 123 | else: 124 | text_temp, label_temp = append_text_label("data/privacy_{}_data_train.txt".format(binary)) 125 | text.extend(text_temp) 126 | label.extend(label_temp) 127 | 128 | if combine_test: 129 | text_temp, label_temp = append_text_label("data/privacy_{}_data_test.txt".format(binary)) 130 | text.extend(text_temp) 131 | label.extend(label_temp) 132 | 133 | return text, label 134 | 135 | def test_cross_val(): 136 | text, label = get_data_from_file(with_aug=False, combine_test=True, 
binary="multi") 137 | 138 | count_vect = CountVectorizer(analyzer="word", ngram_range=(1,1)) 139 | train_data = count_vect.fit_transform(text) 140 | 141 | 142 | clf1 = MultinomialNB() 143 | 144 | print ("Naive Bayes:", np.mean(cross_val_score(clf1, train_data, label, scoring='f1_macro',cv=5))) 145 | 146 | def make_classifier(): 147 | text, label = get_data_from_file(with_aug=True, combine_test=False, binary="multi") 148 | 149 | count_vect = CountVectorizer(analyzer="word", ngram_range=(1,1)) 150 | train_data = count_vect.fit_transform(text) 151 | 152 | filename = 'multi_vectorizer_model.sav' 153 | pickle.dump(count_vect, open(filename, 'wb')) 154 | 155 | clf1 = MultinomialNB() 156 | clf1.fit(train_data, label) 157 | 158 | filename = 'multi_nb_model.sav' 159 | pickle.dump(clf1, open(filename, 'wb')) 160 | 161 | def predict_class(texts, model, vectorizer): 162 | loaded_model = pickle.load(open(model, 'rb')) 163 | loaded_vect = pickle.load(open(vectorizer, 'rb')) 164 | 165 | text = [preprocess_text(t, preprocess_functions) for t in texts] 166 | v_text = loaded_vect.transform(text) 167 | 168 | prediction = loaded_model.predict(v_text) 169 | 170 | return prediction 171 | 172 | def test_classifier(): 173 | test_data = "../data/sec_compass.json" 174 | 175 | with open(test_data, 'r', encoding="utf-8-sig") as p: 176 | requirements = json.loads(p.read()) 177 | 178 | req_type = [r["req_type"] for r in requirements["rows"]] 179 | texts = [r["req_text"] for r in requirements["rows"]] 180 | prediction = predict_class(texts,'nb_model.sav','vectorizer_model.sav') 181 | 182 | for i,p in enumerate(prediction): 183 | print(req_type[i], p) 184 | 185 | def test_classifier_on_pattern(): 186 | pattern_file = "../data/patterns.json" 187 | 188 | with open(pattern_file, 'r') as p: 189 | patterns = json.loads(p.read()) 190 | 191 | texts = [] 192 | 193 | for pattern in patterns: 194 | pattern_text = [] 195 | 196 | pattern_text.append(pattern["excerpt"]) 197 | 198 | for heading in pattern["heading"]: 199 | pattern_text.append(heading["content"].strip()) 200 | 201 | texts.append(". 
".join(pattern_text)) 202 | 203 | # BINARY CLASS 204 | prediction = predict_class(texts,"binary_nb_model.sav","binary_vectorizer_model.sav") 205 | 206 | for i,p in enumerate(prediction): 207 | print(patterns[i]["title"], p) 208 | 209 | print("=="*20) 210 | 211 | # MULTI CLASS 212 | prediction = predict_class(texts,"multi_nb_model.sav","multi_vectorizer_model.sav") 213 | 214 | for i,p in enumerate(prediction): 215 | print(patterns[i]["title"], p) 216 | 217 | 218 | 219 | def test_classifier_flair(): 220 | from flair.data import Sentence 221 | from flair.models import TextClassifier 222 | 223 | # only for binary 224 | # after testing this model, I dont know why it shows all 1 225 | model = TextClassifier.load('binary/glove-roberta-best-model.pt') # create example sentence 226 | 227 | test_data = "../data/sec_compass.json" 228 | 229 | with open(test_data, 'r', encoding="utf-8-sig") as p: 230 | reqs = json.loads(p.read()) 231 | 232 | for r in reqs["rows"]: 233 | sentence = Sentence(r["req_text"]) 234 | model.predict(sentence) 235 | print("True Type:",r["req_type"],r["req_type"]) 236 | # print(sentence.labels) 237 | print(sentence.labels[0].value, sentence.labels[0].score) 238 | print() 239 | 240 | def reduce_test_data(): 241 | test_data = "../data/propan_patterns_requirements.json" 242 | 243 | with open(test_data, 'r', encoding="utf-8-sig") as p: 244 | requirements = json.loads(p.read()) 245 | 246 | texts = [r["req_text"] for r in requirements] 247 | 248 | prediction = predict_class(texts) 249 | 250 | new_test_data = [] 251 | for i, label in enumerate(prediction): 252 | reduced_pattern = [] 253 | goal = soft_pattern 254 | if label == "1": 255 | goal = hard_pattern 256 | 257 | print(goal) 258 | 259 | for p in requirements[i]["pattern"]: 260 | if p["name"] in goal: 261 | reduced_pattern.append(p) 262 | 263 | new_test_data.append({"id":requirements[i]["id"],"req_text":requirements[i]["req_text"],"pattern":reduced_pattern}) 264 | 265 | with open("../data/reduced_propan_patterns_requirements.json", "w") as outfile: 266 | json.dump(new_test_data, outfile, indent=3) 267 | 268 | 269 | def classify_new_data(): 270 | json_file_path = 'all_inqueries.json' 271 | 272 | with open(json_file_path, 'r') as json_file: 273 | data = json.load(json_file) 274 | 275 | texts = [d["req_text"] for d in data] 276 | 277 | prediction = predict_class(texts, "model/binary_nb_model.sav", "model/binary_vectorizer_model.sav") 278 | 279 | # Return data and prediction labels 280 | return data, prediction 281 | 282 | def split_dataset_into_5fold(): 283 | data, labels = classify_new_data() 284 | 285 | # Create a dictionary to hold data grouped by labels 286 | grouped_data = { 287 | 0: [], 288 | 1: [] 289 | } 290 | 291 | for d, label in zip(data, labels): 292 | grouped_data[int(label)].append(d) 293 | 294 | skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) 295 | 296 | folds = [] 297 | 298 | for train_idx, test_idx in skf.split(data, labels): 299 | train_data = [data[i] for i in train_idx] 300 | test_data = [data[i] for i in test_idx] 301 | 302 | train_labels = [labels[i] for i in train_idx] 303 | test_labels = [labels[i] for i in test_idx] 304 | 305 | folds.append((train_data, test_data, train_labels, test_labels)) 306 | 307 | return folds 308 | 309 | def split_dataset_uniformly(): 310 | data, labels = classify_new_data() 311 | 312 | # Create a dictionary to hold data grouped by labels 313 | grouped_data = { 314 | 0: [], 315 | 1: [] 316 | } 317 | 318 | for d, label in zip(data, labels): 319 | 
grouped_data[int(label)].append(d) 320 | 321 | # Split each group into train, dev, and test sets 322 | train, test = [], [] 323 | for label, items in grouped_data.items(): 324 | train_set, test_set = train_test_split(items, test_size=0.2, random_state=42) # Splitting 80% train, 20% test 325 | 326 | train.extend(train_set) 327 | test.extend(test_set) 328 | 329 | return train, test 330 | 331 | def export_to_json(data, filename): 332 | """Export data to a JSON file.""" 333 | with open(filename, 'w') as json_file: 334 | json.dump(data, json_file, indent=4) 335 | 336 | folds = split_dataset_into_5fold() 337 | 338 | for fold_num, (train_data, test_data, train_labels, test_labels) in enumerate(folds, 1): 339 | # Exporting train data for each fold to JSON 340 | train_filename = f'train_patterns_req_v2_fold_{fold_num}.json' 341 | export_to_json(train_data, train_filename) 342 | 343 | # Exporting test data for each fold to JSON 344 | test_filename = f'test_patterns_req_v2_fold_{fold_num}.json' 345 | export_to_json(test_data, test_filename) 346 | 347 | 348 | # test_cross_val() 349 | # make_classifier() 350 | # test_classifier() 351 | 352 | # test_classifier_on_pattern() 353 | 354 | # reduce_test_data() 355 | -------------------------------------------------------------------------------- /annotation-program/app.py: -------------------------------------------------------------------------------- 1 | from flask_sqlalchemy import SQLAlchemy 2 | from flask import Flask, render_template, request, redirect, url_for, flash, jsonify, Response 3 | from flask_bootstrap import Bootstrap 4 | from flask_login import LoginManager, UserMixin, login_user, login_required, logout_user, UserMixin, current_user 5 | from sqlalchemy.orm import relationship 6 | from sqlalchemy.sql import func 7 | from sqlalchemy import desc 8 | from flask_migrate import Migrate 9 | from tqdm import tqdm 10 | import numpy as np 11 | from werkzeug.wrappers import Response as ResponseBase 12 | 13 | # from sentence_transformers import SentenceTransformer 14 | 15 | # Initialize the sentence-transformers model 16 | # model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2') 17 | 18 | app = Flask(__name__) 19 | app.config["SQLALCHEMY_DATABASE_URI"] = 'sqlite:///ltr_annotation.db' 20 | app.config['SECRET_KEY'] = 'klaf9897fwehkwe' # Replace this with a real secret key 21 | 22 | Bootstrap(app) 23 | db = SQLAlchemy(app) 24 | 25 | 26 | migrate = Migrate(app, db) 27 | 28 | login_manager = LoginManager() 29 | login_manager.init_app(app) 30 | login_manager.login_view = 'login' 31 | 32 | class User(UserMixin, db.Model): 33 | id = db.Column(db.Integer, primary_key=True) 34 | username = db.Column(db.String(80), unique=True, nullable=False) 35 | password = db.Column(db.String(120), nullable=False) 36 | annotations = relationship("Annotation", back_populates="user") 37 | 38 | class Tag(db.Model): 39 | id = db.Column(db.Integer, primary_key=True) 40 | name = db.Column(db.String(100), nullable=False) 41 | 42 | query_tags = db.Table('query_tags', 43 | db.Column('tag_id', db.Integer, db.ForeignKey('tag.id'), primary_key=True), 44 | db.Column('query_id', db.Integer, db.ForeignKey('inquery.id'), primary_key=True) 45 | ) 46 | 47 | candidate_tags = db.Table('candidate_tags', 48 | db.Column('tag_id', db.Integer, db.ForeignKey('tag.id'), primary_key=True), 49 | db.Column('candidate_id', db.Integer, db.ForeignKey('candidate.id'), primary_key=True) 50 | ) 51 | 52 | class Inquery(db.Model): 53 | id = db.Column(db.Integer, primary_key=True) 54 | text = 
db.Column(db.String(200), nullable=False) 55 | source = db.Column(db.String(200), nullable=True) 56 | tags = db.relationship('Tag', secondary=query_tags, lazy='subquery', 57 | backref=db.backref('inqueries', lazy=True)) 58 | 59 | 60 | class Candidate(db.Model): 61 | id = db.Column(db.Integer, primary_key=True) 62 | text = db.Column(db.String(200), nullable=False) 63 | description = db.Column(db.String(1000), nullable=False) 64 | source = db.Column(db.String(200), nullable=True) 65 | tags = db.relationship('Tag', secondary=candidate_tags, lazy='subquery', 66 | backref=db.backref('candidates', lazy=True)) 67 | 68 | class CandidatesRecommended(db.Model): 69 | id = db.Column(db.Integer, primary_key=True) 70 | query_id = db.Column(db.Integer, db.ForeignKey('inquery.id'), nullable=False) 71 | candidate_id = db.Column(db.Integer, db.ForeignKey('candidate.id'), nullable=False) 72 | relevance = db.Column(db.Float, nullable=False) 73 | query_data = relationship("Inquery", backref="recommended_candidates") 74 | candidate = relationship("Candidate") 75 | 76 | class Annotation(db.Model): 77 | id = db.Column(db.Integer, primary_key=True) 78 | user_id = db.Column(db.Integer, db.ForeignKey('user.id'), nullable=False) 79 | query_id = db.Column(db.Integer, db.ForeignKey('inquery.id'), nullable=False) 80 | candidate_id = db.Column(db.Integer, db.ForeignKey('candidate.id'), nullable=False) 81 | rank = db.Column(db.Integer, nullable=False) 82 | relevance = db.Column(db.Float, nullable=False) 83 | timestamp = db.Column(db.DateTime(timezone=True), server_default=func.now()) 84 | user = relationship("User", back_populates="annotations") 85 | query_data = relationship("Inquery") 86 | candidate = relationship("Candidate") 87 | 88 | class Similarity(db.Model): 89 | id = db.Column(db.Integer, primary_key=True) 90 | inquery_id = db.Column(db.Integer, db.ForeignKey('inquery.id'), nullable=False) 91 | other_inquery_id = db.Column(db.Integer, nullable=False) 92 | score = db.Column(db.Float, nullable=False) 93 | 94 | @login_manager.user_loader 95 | def load_user(user_id): 96 | return User.query.get(int(user_id)) 97 | 98 | @app.route('/') 99 | @login_required 100 | def home(): 101 | total_queries_by_source = db.session.query(Inquery.source, func.count(Inquery.id)).group_by(Inquery.source).all() 102 | 103 | # Create a subquery that groups by query_id and only select query_id 104 | subquery = db.session.query(Annotation.query_id).filter(Annotation.user_id==current_user.id).group_by(Annotation.query_id).subquery() 105 | 106 | # Now group the subquery result by source 107 | total_queries_done_by_user = db.session.query(Inquery.source, func.count(subquery.c.query_id)).join(subquery, Inquery.id == subquery.c.query_id).group_by(Inquery.source).all() 108 | 109 | total_queries = {source: count for source, count in total_queries_by_source} 110 | queries_done = {source: count for source, count in total_queries_done_by_user} 111 | 112 | # This creates a list of dictionaries, where each dictionary contains the source, total queries, and queries done by the user 113 | queries_data = [{'source': source, 'total': total_queries[source], 'done': queries_done.get(source, 0)} for source in total_queries] 114 | 115 | return render_template('home.html', annotations_data=queries_data) 116 | 117 | @app.route('/tag/') 118 | def tag(tag_name): 119 | # Inquery the database to get all candidates with this tag 120 | tag = Tag.query.filter_by(name=tag_name).first_or_404() 121 | candidates = tag.candidates # assuming a backref in your Tag model 122 | 
return render_template('tag.html', tag=tag, candidates=candidates) 123 | 124 | @app.route('/login', methods=['GET', 'POST']) 125 | def login(): 126 | if request.method == 'POST': 127 | user = User.query.filter_by(username=request.form['username']).first() 128 | if user and user.password == request.form['password']: # You should use hashed passwords in a real application 129 | login_user(user) 130 | return redirect(url_for('home')) 131 | return render_template('login.html') 132 | 133 | @app.route('/register', methods=['GET', 'POST']) 134 | def register(): 135 | if request.method == 'POST': 136 | new_user = User(username=request.form['username'], password=request.form['password']) 137 | db.session.add(new_user) 138 | db.session.commit() 139 | return redirect(url_for('login')) 140 | return render_template('register.html') 141 | 142 | @app.route('/logout') 143 | @login_required 144 | def logout(): 145 | logout_user() 146 | return redirect(url_for('login')) 147 | 148 | def calculate_tag_overlap(query_tags, other_query_tags): 149 | return len(set(query_tags) & set(other_query_tags)) 150 | 151 | @app.route('/apply_annotation', methods=['POST']) 152 | @login_required 153 | def apply_annotation(): 154 | # Fetch the current user's id 155 | user_id = current_user.id 156 | 157 | # Extract the query_id from the pre-annotated data and current_query_id from the form data 158 | previous_query_id = request.form.get("previous_query_id") 159 | current_query_id = request.form.get("current_query_id") 160 | 161 | sql = """ 162 | INSERT INTO annotation (user_id, query_id, candidate_id, rank, relevance, timestamp) 163 | SELECT :user_id, :current_query_id, candidate_id, rank, relevance, datetime('now') 164 | FROM annotation 165 | WHERE query_id = :previous_query_id; 166 | """ 167 | 168 | params = { 169 | "user_id": user_id, 170 | "current_query_id": current_query_id, 171 | "previous_query_id": previous_query_id 172 | } 173 | 174 | db.session.execute(sql, params) 175 | db.session.commit() 176 | 177 | 178 | return redirect(url_for('annotate', source=request.form.get("source"))) 179 | 180 | 181 | @app.route('/annotate/', methods=['GET', 'POST']) 182 | @login_required 183 | def annotate(source): 184 | if request.method == 'POST': 185 | _save_annotations(request.form, current_user.id) 186 | 187 | return redirect(url_for('annotate', source=source)) 188 | 189 | else: 190 | query = _get_unannotated_query_for_user(current_user.id, source) 191 | if not query: 192 | flash("All queries have been annotated. 
Thank you for your contribution!", "info") 193 | return redirect(url_for('home')) 194 | 195 | progress = _calculate_annotation_progress(source, current_user.id) 196 | similar_inquery_data = get_recommended_queries_by_semantic_similarity(query) 197 | recommended_candidates, other_candidates = _get_candidates(query, current_user.id) 198 | 199 | return render_template('annotate.html', query=query, similar_inquery_data=similar_inquery_data, 200 | recommended_candidates=recommended_candidates, other_candidates=other_candidates, 201 | progress=progress) 202 | 203 | def _save_annotations(form_data, user_id): 204 | for candidate_id, relevance in form_data.items(): 205 | if candidate_id == 'query_id': # ignore the query text field 206 | continue 207 | annotation = Annotation(user_id=user_id, query_id=form_data["query_id"], candidate_id=candidate_id, 208 | relevance=float(relevance), rank=int(relevance)) 209 | db.session.add(annotation) 210 | db.session.commit() 211 | 212 | def _get_unannotated_query_for_user(user_id, source): 213 | annotated_queries_ids = db.session.query(Annotation.query_id).join(Inquery, Annotation.query_id == Inquery.id)\ 214 | .filter(Annotation.user_id == user_id, Inquery.source == source).distinct() 215 | return Inquery.query.filter(Inquery.source == source, Inquery.id.notin_(annotated_queries_ids)).first() 216 | 217 | def _get_candidates(query, user_id): 218 | recommended_candidates = CandidatesRecommended.query.filter_by(query_id=query.id).all() 219 | recommended_candidates_ids = [candidate.candidate_id for candidate in recommended_candidates] 220 | 221 | annotated_candidates_ids = [annotation.candidate_id for annotation in 222 | Annotation.query.filter_by(query_id=query.id, user_id=user_id).all()] 223 | 224 | other_candidates = Candidate.query.filter(Candidate.id.notin_(recommended_candidates_ids), 225 | Candidate.id.notin_(annotated_candidates_ids)).all() 226 | return recommended_candidates, other_candidates 227 | 228 | def _calculate_annotation_progress(source, user_id): 229 | total_queries = Inquery.query.filter_by(source=source).count() 230 | annotated_queries_ids = db.session.query(Annotation.query_id).join(Inquery, Annotation.query_id == Inquery.id)\ 231 | .filter(Annotation.user_id == user_id, Inquery.source == source).distinct() 232 | annotated_queries_count = annotated_queries_ids.count() 233 | return (annotated_queries_count / total_queries) * 100 234 | 235 | def get_recommended_queries_by_tags(query): 236 | query_tags = [tag.name for tag in query.tags] 237 | 238 | def get_recommended_queries_by_semantic_similarity(query): 239 | query_tags = [tag.name for tag in query.tags] 240 | 241 | similar_inqueries_with_overlap = [] 242 | 243 | # After fetching the query to annotate 244 | threshold = 0.4 245 | similarities = ( 246 | Similarity.query.filter(Similarity.inquery_id == query.id, Similarity.score >= threshold) 247 | .order_by(desc(Similarity.score)) 248 | .all() 249 | ) 250 | for similarity in similarities: 251 | other_inquery = Inquery.query.get(similarity.other_inquery_id) 252 | other_query_tags = [tag.name for tag in other_inquery.tags] 253 | overlap_count = calculate_tag_overlap(query_tags, other_query_tags) 254 | similar_inqueries_with_overlap.append((similarity, overlap_count)) 255 | 256 | # Order the inqueries based on the number of overlapping tags (in descending order) 257 | similar_inqueries_with_overlap.sort(key=lambda x: x[1], reverse=True) 258 | 259 | similar_inquery_data_dict = {} 260 | for similarity, _ in similar_inqueries_with_overlap: 261 | 
annotations = Annotation.query.filter_by(query_id=similarity.other_inquery_id).all() 262 | if annotations: 263 | for annotation in annotations: 264 | candidate = Candidate.query.get(annotation.candidate_id) 265 | if candidate: 266 | other_inquery = Inquery.query.get(similarity.other_inquery_id) 267 | tags = [tag.name for tag in other_inquery.tags] 268 | 269 | if similarity.other_inquery_id not in similar_inquery_data_dict: 270 | similar_inquery_data_dict[similarity.other_inquery_id] = { 271 | "id": other_inquery.id, 272 | "query": other_inquery.text, 273 | "candidates": [(candidate.text, annotation.relevance)], 274 | "timestamp": annotation.timestamp.strftime("%Y-%m-%d %H:%M:%S"), 275 | "tags": tags, 276 | "score": "{:.2f}".format(similarity.score) 277 | } 278 | else: 279 | similar_inquery_data_dict[similarity.other_inquery_id]['candidates'].append( 280 | (candidate.text, annotation.relevance) 281 | ) 282 | 283 | # Convert dictionary to a list of dictionaries for rendering in the template 284 | similar_inquery_data = list(similar_inquery_data_dict.values()) 285 | 286 | return similar_inquery_data 287 | 288 | @app.route('/user_stats') 289 | @login_required 290 | def user_stats(): 291 | user_annotations = Annotation.query.filter_by(user_id=current_user.id).all() 292 | annotations_data = {} 293 | 294 | for annotation in user_annotations: 295 | try: 296 | if annotation.query_data.id not in annotations_data: 297 | annotations_data[annotation.query_data.id] = { 298 | 'query': annotation.query_data.text, 299 | 'candidates': [(annotation.candidate.text, annotation.relevance)], 300 | 'timestamp': annotation.timestamp.strftime("%Y-%m-%d %H:%M:%S") 301 | } 302 | else: 303 | annotations_data[annotation.query_data.id]['candidates'].append( 304 | (annotation.candidate.text, annotation.relevance)) 305 | except AttributeError as e: 306 | app.logger.error(f'AttributeError for annotation id {annotation.id}: {str(e)}') 307 | 308 | # Convert dictionary to a list of dictionaries for easier handling in the template 309 | annotations_data = list(annotations_data.values()) 310 | 311 | return render_template('user_stats.html', annotations_data=annotations_data) 312 | 313 | @app.route('/precompute_similarity') 314 | def precompute_similarity(): 315 | # Retrieve all inqueries 316 | inqueries = Inquery.query.all() 317 | 318 | # Extract texts from inqueries 319 | texts = [inquery.text for inquery in inqueries] 320 | 321 | # Get embeddings for the texts 322 | embeddings = model.encode(texts) 323 | 324 | # Compute pairwise semantic similarity scores 325 | similarity_matrix = np.inner(embeddings, embeddings) 326 | 327 | # Store the precomputed similarities in the Similarity table 328 | for i in tqdm(range(len(inqueries)), desc="Storing Similarities"): 329 | inquery = inqueries[i] 330 | for j, other_inquery in enumerate(inqueries): 331 | if i != j: # Skip self-similarity 332 | similarity = Similarity(inquery_id=inquery.id, other_inquery_id=other_inquery.id, score=similarity_matrix[i, j]) 333 | db.session.add(similarity) 334 | db.session.commit() 335 | 336 | def get_data_for_all_inqueries(): 337 | # Fetch all inqueries 338 | inqueries = Inquery.query.all() 339 | 340 | all_data = [] 341 | 342 | for inquery in inqueries: 343 | # Fetch all candidates 344 | candidates = Candidate.query.all() 345 | 346 | pattern_list = [] 347 | 348 | for candidate in candidates: 349 | # Check if there's an annotation for the current inquery-candidate pair 350 | annotation = Annotation.query.filter_by(query_id=inquery.id, 
candidate_id=candidate.id).first() 351 | 352 | if annotation: 353 | relevance = annotation.relevance 354 | else: 355 | relevance = 0 356 | 357 | pattern_data = { 358 | "name": candidate.text, 359 | "rating": int(relevance) 360 | } 361 | pattern_list.append(pattern_data) 362 | 363 | # Format data for this inquery 364 | data = { 365 | "id": inquery.id, 366 | "req_type": [tag.name for tag in inquery.tags], # Convert to list 367 | "req_name": f"{inquery.source} {inquery.id}", 368 | "req_text": inquery.text, 369 | "pattern": pattern_list 370 | } 371 | 372 | all_data.append(data) 373 | 374 | return all_data 375 | 376 | 377 | @app.route('/download_all') 378 | def download_json(): 379 | data = get_data_for_all_inqueries() 380 | if not data: 381 | return "No inqueries found", 404 382 | 383 | # Convert the list of dictionaries to a JSON string 384 | json_data = jsonify(data).get_data(as_text=True) 385 | 386 | # Create a Flask Response with headers for download 387 | response = ResponseBase(json_data, content_type="application/json") 388 | response.headers["Content-Disposition"] = "attachment; filename=all_inqueries.json" 389 | return response 390 | 391 | if __name__ == '__main__': 392 | app.run(debug=True) 393 | 394 | -------------------------------------------------------------------------------- /recommender/feature_creation.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | 3 | nltk.download('stopwords') 4 | nltk.download('punkt') 5 | 6 | import json, pickle 7 | import numpy as np 8 | from sklearn.feature_extraction.text import TfidfVectorizer 9 | from scipy import sparse 10 | import nltk 11 | from nltk.corpus import stopwords 12 | from nltk.tokenize import word_tokenize 13 | from sentence_transformers import SentenceTransformer, util 14 | from text_preprocessing import preprocess_text 15 | from text_preprocessing import to_lower, remove_stopword, lemmatize_word 16 | import torch 17 | import os 18 | 19 | preprocess_functions = [to_lower, remove_stopword, lemmatize_word] 20 | 21 | PARENT_FOLDER = "data/" 22 | 23 | ''' 24 | Construct features for learning-to-rank 25 | The main function is the construct_features(q) which receives input of a query (which in our case a requirement) 26 | then it is computed for each pattern in privacypatterns.org 27 | ''' 28 | 29 | class PrivacyPatternFeatures(object): 30 | def __init__(self): 31 | self.patterns, self.pattern_titles, self.pattern_excerpts = self.get_corpus_pattern() 32 | self.initiate_tf_idf() 33 | self.initiate_bm25(0.75, 1.6) 34 | 35 | print("Loading LTR Embeddings...") 36 | # self.model_sentence_transformer = SentenceTransformer('all-MiniLM-L6-v2') 37 | # self.model_sentence_transformer_overflow = SentenceTransformer('flax-sentence-embeddings/stackoverflow_mpnet-base') 38 | 39 | self.model_sentence_transformer = SentenceTransformer('all-mpnet-base-v2') 40 | self.model_sentence_transformer_overflow = SentenceTransformer('dean-ai/legal_heBERT_ft') 41 | 42 | # self.precompute_pattern_embeddings() 43 | self.load_pattern_embeddings() 44 | 45 | def construct_features(self, q): 46 | q_words = word_tokenize(self.remove_stopwords(q)) 47 | 48 | # we adapt the representation from MSLR-WEB dataset 49 | # q is query that represents the requirements 50 | # pattern is the document 51 | # each query have the pattern features 52 | # query level feature = when the parameter only contain q 53 | 54 | len_q = len(self.remove_stopwords(q)) 55 | idf_q = self.get_idf(q_words) 56 | tf_idf_q = self.tf_idf_features(q) 57 | 
bm25 = self.bm25(q) 58 | binary_q, multi_q = self.class_features([q]) 59 | binary_pattern, multi_pattern = self.class_features(self.patterns) 60 | 61 | cosine_pattern, cosine_title, cosine_excerpt, cosine_pattern_overflow, cosine_title_overflow, cosine_excerpt_overflow = self.semantic_similarity_features(q) 62 | deep_semantic_features = self.deep_semantic_interaction_features(q) 63 | 64 | features_all = [] 65 | for i, pattern in enumerate(self.patterns): 66 | features = [] 67 | features.extend(self.number_of_covered_words(q_words, pattern)) # 1, 2 68 | features.append(len_q) # 3 69 | features.append(idf_q) # 4 70 | features.extend(self.tf_features(q_words, pattern)) # 5 - 14 71 | features.extend(tf_idf_q) # 15 - 19 72 | features.append(bm25[i]) # 20 73 | features.append(float(cosine_pattern[0][i])) # 21 74 | features.append(float(cosine_title[0][i])) # 22 75 | features.append(float(cosine_excerpt[0][i])) # 23 76 | features.append(float(cosine_pattern_overflow[0][i])) # 24 77 | features.append(float(cosine_title_overflow[0][i])) # 25 78 | features.append(float(cosine_excerpt_overflow[0][i])) # 26 79 | 80 | features.append(binary_q[0]) 81 | features.append(multi_q[0]) 82 | features.append(binary_pattern[i]) 83 | features.append(multi_pattern[i]) 84 | 85 | # Append deep semantic interaction features: similarities 86 | features.append(float(deep_semantic_features["similarities"][0][0][i])) # pattern similarity 87 | features.append(float(deep_semantic_features["similarities"][1][0][i])) # title similarity 88 | features.append(float(deep_semantic_features["similarities"][2][0][i])) # excerpt similarity 89 | features.append(float(deep_semantic_features["similarities"][3][0][i])) # pattern similarity (overflow model) 90 | features.append(float(deep_semantic_features["similarities"][4][0][i])) # title similarity (overflow model) 91 | features.append(float(deep_semantic_features["similarities"][5][0][i])) # excerpt similarity (overflow model) 92 | 93 | # Append deep semantic interaction features: hadamard products 94 | features.extend(deep_semantic_features["hadamard_products"][0][i].tolist()) # pattern 95 | features.extend(deep_semantic_features["hadamard_products"][1][i].tolist()) # title 96 | features.extend(deep_semantic_features["hadamard_products"][2][i].tolist()) # excerpt 97 | features.extend(deep_semantic_features["hadamard_products"][3][i].tolist()) # pattern (overflow model) 98 | features.extend(deep_semantic_features["hadamard_products"][4][i].tolist()) # title (overflow model) 99 | features.extend(deep_semantic_features["hadamard_products"][5][i].tolist()) # excerpt (overflow model) 100 | 101 | # # Append deep semantic interaction features: concatenation 102 | features.extend(deep_semantic_features["concatenations"][0][i].tolist()) # pattern 103 | features.extend(deep_semantic_features["concatenations"][1][i].tolist()) # title 104 | features.extend(deep_semantic_features["concatenations"][2][i].tolist()) # excerpt 105 | features.extend(deep_semantic_features["concatenations"][3][i].tolist()) # pattern (overflow model) 106 | features.extend(deep_semantic_features["concatenations"][4][i].tolist()) # title (overflow model) 107 | features.extend(deep_semantic_features["concatenations"][5][i].tolist()) # excerpt (overflow model) 108 | 109 | features_all.append(features) 110 | 111 | return features_all 112 | 113 | def get_corpus_pattern(self): 114 | pattern_file= PARENT_FOLDER + "patterns.json" 115 | X = [] 116 | title = [] 117 | excerpt = [] 118 | with open(pattern_file, 'r') as p: 119 | 
patterns = json.loads(p.read()) 120 | 121 | for pattern in patterns: 122 | text = "" 123 | 124 | filename = pattern["filename"].replace(".md","").replace("-"," ") 125 | 126 | title.append(filename) 127 | excerpt.append(pattern["excerpt"].strip()) 128 | 129 | text += filename 130 | if not text.endswith("."): 131 | text += ". " 132 | 133 | text += pattern["excerpt"].strip() 134 | if not text.endswith("."): 135 | text += ". " 136 | 137 | for heading in pattern["heading"]: 138 | text += heading["content"].strip() 139 | if not text.endswith("."): 140 | text += ". " 141 | 142 | X.append(text) 143 | 144 | X_new, title_new, excerpt_new = self.get_new_patterns() 145 | 146 | X.extend(X_new) 147 | title.extend(title_new) 148 | excerpt.extend(excerpt_new) 149 | 150 | return X, title, excerpt 151 | 152 | 153 | def get_new_patterns(self): 154 | pattern_file= PARENT_FOLDER + "patterns_new.json" 155 | X = [] 156 | title = [] 157 | excerpt = [] 158 | with open(pattern_file, 'r') as p: 159 | patterns = json.loads(p.read()) 160 | 161 | for pattern in patterns: 162 | X.append(pattern["description"]) 163 | title.append(pattern["title"]) 164 | excerpt.append(pattern["excerpt"]) 165 | 166 | return X, title, excerpt 167 | 168 | def remove_stopwords(self, q): 169 | stop_words = set(stopwords.words('english')) 170 | word_tokens = word_tokenize(q) 171 | filtered_sentence = " ".join([w for w in word_tokens if not w.lower() in stop_words]) 172 | 173 | return filtered_sentence 174 | 175 | 176 | def initiate_tf_idf(self): 177 | self.tf_idf_vectorizer = TfidfVectorizer(norm=None, smooth_idf=False) 178 | self.tf_idf_vectorizer.fit(self.patterns) 179 | self.tf_idf_feature_names = self.tf_idf_vectorizer.get_feature_names_out() 180 | 181 | 182 | def initiate_bm25(self, b, k1): 183 | self.b = b 184 | self.k1 = k1 185 | 186 | y = super(TfidfVectorizer, self.tf_idf_vectorizer).transform(self.patterns) 187 | self.avdl = y.sum(1).mean() 188 | 189 | 190 | def bm25(self, q): 191 | X = self.patterns 192 | """ Calculate BM25 between query q and documents X """ 193 | b, k1, avdl = self.b, self.k1, self.avdl 194 | 195 | # apply CountVectorizer 196 | X = super(TfidfVectorizer, self.tf_idf_vectorizer).transform(X) 197 | len_X = X.sum(1).A1 198 | q, = super(TfidfVectorizer, self.tf_idf_vectorizer).transform([q]) 199 | assert sparse.isspmatrix_csr(q) 200 | 201 | # convert to csc for better column slicing 202 | X = X.tocsc()[:, q.indices] 203 | denom = X + (k1 * (1 - b + b * len_X / avdl))[:, None] 204 | # idf(t) = log [ n / df(t) ] + 1 in sklearn, so it need to be coneverted 205 | # to idf(t) = log [ n / df(t) ] with minus 1 206 | idf = self.tf_idf_vectorizer._tfidf.idf_[None, q.indices] - 1. 207 | numer = X.multiply(np.broadcast_to(idf, X.shape)) * (k1 + 1) 208 | return (numer / denom).sum(1).A1 209 | 210 | def number_of_covered_words(self, q_words, pattern): 211 | # How many terms in the user query are covered by the text. 212 | # ration = Covered query term number divided by the number of query terms. 213 | 214 | n = 0 215 | for word in q_words: 216 | if word.lower() in pattern.lower(): 217 | n += 1 218 | 219 | ratio = n/len(q_words) 220 | return [n, ratio] 221 | 222 | def get_idf(self, q_words): 223 | # 1 divided by the number of documents containing the query terms. 
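        # Worked illustration (comment added for clarity, not a behavioural change):
        # for q_words = ["encrypt", "consent"], word_in_patterns collects each query
        # term that appears in at least one pattern text. If both terms occur, the
        # returned feature is 1/2 = 0.5; if neither occurs, 0 is returned. Note this
        # is a single aggregate over the whole query rather than a per-term IDF.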
224 | 225 | n = 0 226 | word_in_patterns = set() 227 | for pattern in self.patterns: 228 | for word in q_words: 229 | if word.lower() in pattern.lower(): 230 | word_in_patterns.add(word.lower()) 231 | 232 | if len(list(word_in_patterns)) == 0: 233 | return 0 234 | 235 | idf = 1/len(list(word_in_patterns)) 236 | 237 | return idf 238 | 239 | def tf_features(self, q_words, pattern): 240 | # Sum, Min, Max, Average, Variance of counts of each query term in the document. 241 | # Normalized version : term counts divided by text length 242 | 243 | pattern_words = word_tokenize(pattern) 244 | total_len = len(pattern_words) 245 | n_count_all = [pattern_words.count(word) for word in q_words] 246 | 247 | tf_sum, tf_min, tf_max, tf_avg, tf_var = sum(n_count_all), min(n_count_all), max(n_count_all), np.average(n_count_all), np.var(n_count_all) 248 | 249 | norm_tf_sum, norm_tf_min, norm_tf_max, norm_tf_avg, norm_tf_var = sum(n_count_all)/total_len, min(n_count_all)/total_len, max(n_count_all)/total_len, np.average(n_count_all)/total_len, np.var(n_count_all)/float(total_len) 250 | 251 | return [tf_sum, tf_min, tf_max, tf_avg, tf_var, norm_tf_sum, norm_tf_min, norm_tf_max, norm_tf_avg, norm_tf_var] 252 | 253 | 254 | def tf_idf_features(self, q): 255 | tfidf_matrix= self.tf_idf_vectorizer.transform([q]).todense() 256 | feature_index = tfidf_matrix[0,:].nonzero()[1] 257 | tfidf_scores = zip([self.tf_idf_feature_names[i] for i in feature_index], [tfidf_matrix[0, x] for x in feature_index]) 258 | 259 | word_scores = [score for score in dict(tfidf_scores).values()] 260 | 261 | tfidf_sum, tfidf_min, tfidf_max, tfidf_avg, tfidf_var = sum(word_scores), min(word_scores), max(word_scores), np.average(word_scores), np.var(word_scores) 262 | 263 | return [tfidf_sum, tfidf_min, tfidf_max, tfidf_avg, tfidf_var] 264 | 265 | def precompute_pattern_embeddings(self): 266 | PARENT_FOLDER = "" 267 | print("Precompute Pattern Embeddings") 268 | # Compute embeddings for patterns 269 | self.emb_pattern = self.model_sentence_transformer.encode(self.patterns, convert_to_tensor=True) 270 | self.emb_pattern_title = self.model_sentence_transformer.encode(self.pattern_titles, convert_to_tensor=True) 271 | self.emb_pattern_excerpt = self.model_sentence_transformer.encode(self.pattern_excerpts, convert_to_tensor=True) 272 | 273 | self.emb_pattern_overflow = self.model_sentence_transformer_overflow.encode(self.patterns, convert_to_tensor=True) 274 | self.emb_pattern_title_overflow = self.model_sentence_transformer_overflow.encode(self.pattern_titles, convert_to_tensor=True) 275 | self.emb_pattern_excerpt_overflow = self.model_sentence_transformer_overflow.encode(self.pattern_excerpts, convert_to_tensor=True) 276 | 277 | # Save the embeddings for later use 278 | with open(PARENT_FOLDER + 'LTR_resources/emb_pattern.pkl', 'wb') as f: 279 | pickle.dump(self.emb_pattern, f) 280 | 281 | with open(PARENT_FOLDER + 'LTR_resources/emb_pattern_title.pkl', 'wb') as f: 282 | pickle.dump(self.emb_pattern_title, f) 283 | 284 | with open(PARENT_FOLDER + 'LTR_resources/emb_pattern_excerpt.pkl', 'wb') as f: 285 | pickle.dump(self.emb_pattern_excerpt, f) 286 | 287 | with open(PARENT_FOLDER + 'LTR_resources/emb_pattern_overflow.pkl', 'wb') as f: 288 | pickle.dump(self.emb_pattern_overflow, f) 289 | 290 | with open(PARENT_FOLDER + 'LTR_resources/emb_pattern_title_overflow.pkl', 'wb') as f: 291 | pickle.dump(self.emb_pattern_title_overflow, f) 292 | 293 | with open(PARENT_FOLDER + 'LTR_resources/emb_pattern_excerpt_overflow.pkl', 'wb') as f: 294 | 
pickle.dump(self.emb_pattern_excerpt_overflow, f) 295 | 296 | def load_pattern_embeddings(self): 297 | # Load the embeddings from the saved files 298 | with open('LTR_resources/emb_pattern.pkl', 'rb') as f: 299 | self.emb_pattern = pickle.load(f) 300 | 301 | with open('LTR_resources/emb_pattern_title.pkl', 'rb') as f: 302 | self.emb_pattern_title = pickle.load(f) 303 | 304 | with open('LTR_resources/emb_pattern_excerpt.pkl', 'rb') as f: 305 | self.emb_pattern_excerpt = pickle.load(f) 306 | 307 | with open('LTR_resources/emb_pattern_overflow.pkl', 'rb') as f: 308 | self.emb_pattern_overflow = pickle.load(f) 309 | 310 | with open('LTR_resources/emb_pattern_title_overflow.pkl', 'rb') as f: 311 | self.emb_pattern_title_overflow = pickle.load(f) 312 | 313 | with open('LTR_resources/emb_pattern_excerpt_overflow.pkl', 'rb') as f: 314 | self.emb_pattern_excerpt_overflow = pickle.load(f) 315 | 316 | def semantic_similarity_features(self, q): 317 | emb_q = self.model_sentence_transformer.encode(q, convert_to_tensor=True) 318 | 319 | cosine_scores_pattern = util.cos_sim(emb_q, self.emb_pattern) 320 | cosine_scores_title = util.cos_sim(emb_q, self.emb_pattern_title) 321 | cosine_scores_excerpt = util.cos_sim(emb_q, self.emb_pattern_excerpt) 322 | 323 | emb_q = self.model_sentence_transformer_overflow.encode(q, convert_to_tensor=True) 324 | 325 | cosine_scores_pattern_overflow = util.cos_sim(emb_q, self.emb_pattern_overflow) 326 | cosine_scores_title_overflow = util.cos_sim(emb_q, self.emb_pattern_title_overflow) 327 | cosine_scores_excerpt_overflow = util.cos_sim(emb_q, self.emb_pattern_excerpt_overflow) 328 | 329 | return cosine_scores_pattern, cosine_scores_title, cosine_scores_excerpt, cosine_scores_pattern_overflow, cosine_scores_title_overflow, cosine_scores_excerpt_overflow 330 | 331 | def hadamard_product(self, tensor1, tensor2): 332 | return tensor1 * tensor2 333 | 334 | def deep_semantic_interaction_features(self, q): 335 | # Encode the query 336 | emb_q = self.model_sentence_transformer.encode(q, convert_to_tensor=True) 337 | emb_q_overflow = self.model_sentence_transformer_overflow.encode(q, convert_to_tensor=True) 338 | 339 | # Compute the "ideal" similarity, which is the query with itself 340 | ideal_emb = self.model_sentence_transformer.encode([q + " [SEP] " + q], convert_to_tensor=True) 341 | ideal_emb_overflow = self.model_sentence_transformer_overflow.encode([q + " [SEP] " + q], convert_to_tensor=True) 342 | 343 | # Compute Hadamard product between query and pattern embeddings 344 | hadamard_emb_pattern = self.hadamard_product(emb_q, self.emb_pattern) 345 | hadamard_emb_pattern_title = self.hadamard_product(emb_q, self.emb_pattern_title) 346 | hadamard_emb_pattern_excerpt = self.hadamard_product(emb_q, self.emb_pattern_excerpt) 347 | 348 | hadamard_emb_pattern_overflow = self.hadamard_product(emb_q_overflow, self.emb_pattern_overflow) 349 | hadamard_emb_pattern_title_overflow = self.hadamard_product(emb_q_overflow, self.emb_pattern_title_overflow) 350 | hadamard_emb_pattern_excerpt_overflow = self.hadamard_product(emb_q_overflow, self.emb_pattern_excerpt_overflow) 351 | 352 | # Compute Concatenation between query and pattern embeddings 353 | concat_emb_pattern = torch.cat((emb_q.unsqueeze(0), self.emb_pattern), dim=0) 354 | concat_emb_pattern_title = torch.cat((emb_q.unsqueeze(0), self.emb_pattern_title), dim=0) 355 | concat_emb_pattern_excerpt = torch.cat((emb_q.unsqueeze(0), self.emb_pattern_excerpt), dim=0) 356 | 357 | concat_emb_pattern_overflow = 
torch.cat((emb_q_overflow.unsqueeze(0), self.emb_pattern_overflow), dim=0) 358 | concat_emb_pattern_title_overflow = torch.cat((emb_q_overflow.unsqueeze(0), self.emb_pattern_title_overflow), dim=0) 359 | concat_emb_pattern_excerpt_overflow = torch.cat((emb_q_overflow.unsqueeze(0), self.emb_pattern_excerpt_overflow), dim=0) 360 | 361 | # Compute similarity between the query embeddings and the precomputed pattern embeddings 362 | similarities_pattern = util.pytorch_cos_sim(emb_q, self.emb_pattern) 363 | similarities_title = util.pytorch_cos_sim(emb_q, self.emb_pattern_title) 364 | similarities_excerpt = util.pytorch_cos_sim(emb_q, self.emb_pattern_excerpt) 365 | 366 | similarities_pattern_overflow = util.pytorch_cos_sim(emb_q_overflow, self.emb_pattern_overflow) 367 | similarities_title_overflow = util.pytorch_cos_sim(emb_q_overflow, self.emb_pattern_title_overflow) 368 | similarities_excerpt_overflow = util.pytorch_cos_sim(emb_q_overflow, self.emb_pattern_excerpt_overflow) 369 | 370 | # Returning features which include the similarities, the hadamard product embeddings, and concatenations 371 | return { 372 | "similarities": (similarities_pattern, similarities_title, similarities_excerpt, 373 | similarities_pattern_overflow, similarities_title_overflow, similarities_excerpt_overflow), 374 | "hadamard_products": (hadamard_emb_pattern, hadamard_emb_pattern_title, hadamard_emb_pattern_excerpt, 375 | hadamard_emb_pattern_overflow, hadamard_emb_pattern_title_overflow, hadamard_emb_pattern_excerpt_overflow), 376 | "concatenations": (concat_emb_pattern, concat_emb_pattern_title, concat_emb_pattern_excerpt, 377 | concat_emb_pattern_overflow, concat_emb_pattern_title_overflow, concat_emb_pattern_excerpt_overflow) 378 | } 379 | 380 | 381 | 382 | def predict_class(self, texts, model, vectorizer): 383 | loaded_model = pickle.load(open(model, 'rb')) 384 | loaded_vect = pickle.load(open(vectorizer, 'rb')) 385 | 386 | text = [preprocess_text(t, preprocess_functions) for t in texts] 387 | v_text = loaded_vect.transform(text) 388 | 389 | prediction = loaded_model.predict(v_text) 390 | 391 | return prediction 392 | 393 | def class_features(self, texts): 394 | PARENT_FOLDER = "classification_model/" 395 | 396 | # BINARY CLASS 397 | binary_prediction = self.predict_class(texts, PARENT_FOLDER + "binary_nb_model.sav", PARENT_FOLDER + "binary_vectorizer_model.sav") 398 | 399 | # MULTI CLASS 400 | multi_prediction = self.predict_class(texts,PARENT_FOLDER + "multi_nb_model.sav",PARENT_FOLDER + "multi_vectorizer_model.sav") 401 | 402 | return binary_prediction, multi_prediction 403 | -------------------------------------------------------------------------------- /letor/feature_creation.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | 3 | nltk.download('stopwords') 4 | nltk.download('punkt') 5 | 6 | import json, pickle 7 | import numpy as np 8 | from sklearn.feature_extraction.text import TfidfVectorizer 9 | from scipy import sparse 10 | import nltk 11 | from nltk.corpus import stopwords 12 | from nltk.tokenize import word_tokenize 13 | from sentence_transformers import SentenceTransformer, util 14 | from text_preprocessing import preprocess_text 15 | from text_preprocessing import to_lower, remove_stopword, lemmatize_word 16 | from transformers import BertTokenizer, BertModel 17 | import torch 18 | import os 19 | import time 20 | 21 | preprocess_functions = [to_lower, remove_stopword, lemmatize_word] 22 | 23 | PARENT_FOLDER = "" 24 | 25 | ''' 26 | Construct 
features for learning-to-rank 27 | The main function is the construct_features(q) which receives input of a query (which in our case a requirement) 28 | then it is computed for each pattern in privacypatterns.org 29 | ''' 30 | 31 | class PrivacyPatternFeatures(object): 32 | def __init__(self): 33 | self.patterns, self.pattern_titles, self.pattern_excerpts = self.get_corpus_pattern() 34 | self.initiate_tf_idf() 35 | self.initiate_bm25(0.75, 1.6) 36 | 37 | print("Loading LTR Embeddings...") 38 | 39 | self.model_sentence_transformer = SentenceTransformer('all-mpnet-base-v2') 40 | self.model_sentence_transformer_overflow = SentenceTransformer('dean-ai/legal_heBERT_ft') 41 | 42 | self.emb_pattern_file = PARENT_FOLDER + 'LTR_resources/emb_pattern.pkl' 43 | if os.path.isfile(self.emb_pattern_file): 44 | self.load_pattern_embeddings() 45 | else: 46 | self.precompute_pattern_embeddings() 47 | 48 | def construct_features(self, q): 49 | q_words = word_tokenize(self.remove_stopwords(q)) 50 | 51 | # we adapt the representation from MSLR-WEB dataset 52 | # q is query that represents the requirements 53 | # pattern is the document 54 | # each query have the pattern features 55 | # query level feature = when the parameter only contain q 56 | 57 | len_q = len(self.remove_stopwords(q)) 58 | idf_q = self.get_idf(q_words) 59 | tf_idf_q = self.tf_idf_features(q) 60 | bm25 = self.bm25(q) 61 | binary_q, multi_q = self.class_features([q]) 62 | binary_pattern, multi_pattern = self.class_features(self.patterns) 63 | 64 | cosine_pattern, cosine_title, cosine_excerpt, cosine_pattern_overflow, cosine_title_overflow, cosine_excerpt_overflow = self.semantic_similarity_features(q) 65 | deep_semantic_features = self.deep_semantic_interaction_features(q) 66 | 67 | features_all = [] 68 | for i, pattern in enumerate(self.patterns): 69 | features = [] 70 | features.extend(self.number_of_covered_words(q_words, pattern)) # 1, 2 71 | features.append(len_q) # 3 72 | features.append(idf_q) # 4 73 | features.extend(self.tf_features(q_words, pattern)) # 5 - 14 74 | features.extend(tf_idf_q) # 15 - 19 75 | features.append(bm25[i]) # 20 76 | features.append(float(cosine_pattern[0][i])) # 21 77 | features.append(float(cosine_title[0][i])) # 22 78 | features.append(float(cosine_excerpt[0][i])) # 23 79 | features.append(float(cosine_pattern_overflow[0][i])) # 24 80 | features.append(float(cosine_title_overflow[0][i])) # 25 81 | features.append(float(cosine_excerpt_overflow[0][i])) # 26 82 | 83 | features.append(binary_q[0]) 84 | features.append(multi_q[0]) 85 | features.append(binary_pattern[i]) 86 | features.append(multi_pattern[i]) 87 | 88 | # Append deep semantic interaction features: similarities 89 | features.append(float(deep_semantic_features["similarities"][0][0][i])) # pattern similarity 90 | features.append(float(deep_semantic_features["similarities"][1][0][i])) # title similarity 91 | features.append(float(deep_semantic_features["similarities"][2][0][i])) # excerpt similarity 92 | features.append(float(deep_semantic_features["similarities"][3][0][i])) # pattern similarity (overflow model) 93 | features.append(float(deep_semantic_features["similarities"][4][0][i])) # title similarity (overflow model) 94 | features.append(float(deep_semantic_features["similarities"][5][0][i])) # excerpt similarity (overflow model) 95 | 96 | # Append deep semantic interaction features: hadamard products 97 | features.extend(deep_semantic_features["hadamard_products"][0][i].tolist()) # pattern 98 | 
features.extend(deep_semantic_features["hadamard_products"][1][i].tolist()) # title 99 | features.extend(deep_semantic_features["hadamard_products"][2][i].tolist()) # excerpt 100 | features.extend(deep_semantic_features["hadamard_products"][3][i].tolist()) # pattern (overflow model) 101 | features.extend(deep_semantic_features["hadamard_products"][4][i].tolist()) # title (overflow model) 102 | features.extend(deep_semantic_features["hadamard_products"][5][i].tolist()) # excerpt (overflow model) 103 | 104 | # # Append deep semantic interaction features: concatenation 105 | features.extend(deep_semantic_features["concatenations"][0][i].tolist()) # pattern 106 | features.extend(deep_semantic_features["concatenations"][1][i].tolist()) # title 107 | features.extend(deep_semantic_features["concatenations"][2][i].tolist()) # excerpt 108 | features.extend(deep_semantic_features["concatenations"][3][i].tolist()) # pattern (overflow model) 109 | features.extend(deep_semantic_features["concatenations"][4][i].tolist()) # title (overflow model) 110 | features.extend(deep_semantic_features["concatenations"][5][i].tolist()) # excerpt (overflow model) 111 | 112 | features_all.append(features) 113 | 114 | return features_all 115 | 116 | def get_corpus_pattern(self): 117 | pattern_file= PARENT_FOLDER + "patterns.json" 118 | X = [] 119 | title = [] 120 | excerpt = [] 121 | with open(pattern_file, 'r') as p: 122 | patterns = json.loads(p.read()) 123 | 124 | for pattern in patterns: 125 | text = "" 126 | 127 | filename = pattern["filename"].replace(".md","").replace("-"," ") 128 | 129 | title.append(filename) 130 | excerpt.append(pattern["excerpt"].strip()) 131 | 132 | text += filename 133 | if not text.endswith("."): 134 | text += ". " 135 | 136 | text += pattern["excerpt"].strip() 137 | if not text.endswith("."): 138 | text += ". " 139 | 140 | for heading in pattern["heading"]: 141 | text += heading["content"].strip() 142 | if not text.endswith("."): 143 | text += ". 
" 144 | 145 | X.append(text) 146 | 147 | X_new, title_new, excerpt_new = self.get_new_patterns() 148 | 149 | X.extend(X_new) 150 | title.extend(title_new) 151 | excerpt.extend(excerpt_new) 152 | 153 | return X, title, excerpt 154 | 155 | 156 | def get_new_patterns(self): 157 | pattern_file= PARENT_FOLDER + "patterns_new.json" 158 | X = [] 159 | title = [] 160 | excerpt = [] 161 | with open(pattern_file, 'r') as p: 162 | patterns = json.loads(p.read()) 163 | 164 | for pattern in patterns: 165 | X.append(pattern["description"]) 166 | title.append(pattern["title"]) 167 | excerpt.append(pattern["excerpt"]) 168 | 169 | return X, title, excerpt 170 | 171 | def remove_stopwords(self, q): 172 | stop_words = set(stopwords.words('english')) 173 | word_tokens = word_tokenize(q) 174 | filtered_sentence = " ".join([w for w in word_tokens if not w.lower() in stop_words]) 175 | 176 | return filtered_sentence 177 | 178 | 179 | def initiate_tf_idf(self): 180 | self.tf_idf_vectorizer = TfidfVectorizer(norm=None, smooth_idf=False) 181 | self.tf_idf_vectorizer.fit(self.patterns) 182 | self.tf_idf_feature_names = self.tf_idf_vectorizer.get_feature_names_out() 183 | 184 | 185 | def initiate_bm25(self, b, k1): 186 | self.b = b 187 | self.k1 = k1 188 | 189 | y = super(TfidfVectorizer, self.tf_idf_vectorizer).transform(self.patterns) 190 | self.avdl = y.sum(1).mean() 191 | 192 | 193 | def bm25(self, q): 194 | X = self.patterns 195 | """ Calculate BM25 between query q and documents X """ 196 | b, k1, avdl = self.b, self.k1, self.avdl 197 | 198 | # apply CountVectorizer 199 | X = super(TfidfVectorizer, self.tf_idf_vectorizer).transform(X) 200 | len_X = X.sum(1).A1 201 | q, = super(TfidfVectorizer, self.tf_idf_vectorizer).transform([q]) 202 | assert sparse.isspmatrix_csr(q) 203 | 204 | # convert to csc for better column slicing 205 | X = X.tocsc()[:, q.indices] 206 | denom = X + (k1 * (1 - b + b * len_X / avdl))[:, None] 207 | # idf(t) = log [ n / df(t) ] + 1 in sklearn, so it need to be coneverted 208 | # to idf(t) = log [ n / df(t) ] with minus 1 209 | idf = self.tf_idf_vectorizer._tfidf.idf_[None, q.indices] - 1. 210 | numer = X.multiply(np.broadcast_to(idf, X.shape)) * (k1 + 1) 211 | return (numer / denom).sum(1).A1 212 | 213 | def number_of_covered_words(self, q_words, pattern): 214 | # How many terms in the user query are covered by the text. 215 | # ration = Covered query term number divided by the number of query terms. 216 | 217 | n = 0 218 | for word in q_words: 219 | if word.lower() in pattern.lower(): 220 | n += 1 221 | 222 | ratio = n/len(q_words) 223 | return [n, ratio] 224 | 225 | def get_idf(self, q_words): 226 | # 1 divided by the number of documents containing the query terms. 227 | 228 | n = 0 229 | word_in_patterns = set() 230 | for pattern in self.patterns: 231 | for word in q_words: 232 | if word.lower() in pattern.lower(): 233 | word_in_patterns.add(word.lower()) 234 | 235 | if len(list(word_in_patterns)) == 0: 236 | return 0 237 | 238 | idf = 1/len(list(word_in_patterns)) 239 | 240 | return idf 241 | 242 | def tf_features(self, q_words, pattern): 243 | # Sum, Min, Max, Average, Variance of counts of each query term in the document. 
244 | # Normalized version : term counts divided by text length 245 | 246 | pattern_words = word_tokenize(pattern) 247 | total_len = len(pattern_words) 248 | n_count_all = [pattern_words.count(word) for word in q_words] 249 | 250 | tf_sum, tf_min, tf_max, tf_avg, tf_var = sum(n_count_all), min(n_count_all), max(n_count_all), np.average(n_count_all), np.var(n_count_all) 251 | 252 | norm_tf_sum, norm_tf_min, norm_tf_max, norm_tf_avg, norm_tf_var = sum(n_count_all)/total_len, min(n_count_all)/total_len, max(n_count_all)/total_len, np.average(n_count_all)/total_len, np.var(n_count_all)/float(total_len) 253 | 254 | return [tf_sum, tf_min, tf_max, tf_avg, tf_var, norm_tf_sum, norm_tf_min, norm_tf_max, norm_tf_avg, norm_tf_var] 255 | 256 | 257 | def tf_idf_features(self, q): 258 | tfidf_matrix= self.tf_idf_vectorizer.transform([q]).todense() 259 | feature_index = tfidf_matrix[0,:].nonzero()[1] 260 | tfidf_scores = zip([self.tf_idf_feature_names[i] for i in feature_index], [tfidf_matrix[0, x] for x in feature_index]) 261 | 262 | word_scores = [score for score in dict(tfidf_scores).values()] 263 | 264 | tfidf_sum, tfidf_min, tfidf_max, tfidf_avg, tfidf_var = sum(word_scores), min(word_scores), max(word_scores), np.average(word_scores), np.var(word_scores) 265 | 266 | return [tfidf_sum, tfidf_min, tfidf_max, tfidf_avg, tfidf_var] 267 | 268 | def load_pattern_embeddings(self): 269 | # Load the embeddings from the saved files 270 | with open(PARENT_FOLDER + 'LTR_resources/emb_pattern.pkl', 'rb') as f: 271 | self.emb_pattern = pickle.load(f) 272 | 273 | with open(PARENT_FOLDER + 'LTR_resources/emb_pattern_title.pkl', 'rb') as f: 274 | self.emb_pattern_title = pickle.load(f) 275 | 276 | with open(PARENT_FOLDER + 'LTR_resources/emb_pattern_excerpt.pkl', 'rb') as f: 277 | self.emb_pattern_excerpt = pickle.load(f) 278 | 279 | with open(PARENT_FOLDER + 'LTR_resources/emb_pattern_overflow.pkl', 'rb') as f: 280 | self.emb_pattern_overflow = pickle.load(f) 281 | 282 | with open(PARENT_FOLDER + 'LTR_resources/emb_pattern_title_overflow.pkl', 'rb') as f: 283 | self.emb_pattern_title_overflow = pickle.load(f) 284 | 285 | with open(PARENT_FOLDER + 'LTR_resources/emb_pattern_excerpt_overflow.pkl', 'rb') as f: 286 | self.emb_pattern_excerpt_overflow = pickle.load(f) 287 | 288 | def precompute_pattern_embeddings(self): 289 | print("Precompute Pattern Embeddings") 290 | # Compute embeddings for patterns 291 | self.emb_pattern = self.model_sentence_transformer.encode(self.patterns, convert_to_tensor=True) 292 | self.emb_pattern_title = self.model_sentence_transformer.encode(self.pattern_titles, convert_to_tensor=True) 293 | self.emb_pattern_excerpt = self.model_sentence_transformer.encode(self.pattern_excerpts, convert_to_tensor=True) 294 | 295 | self.emb_pattern_overflow = self.model_sentence_transformer_overflow.encode(self.patterns, convert_to_tensor=True) 296 | self.emb_pattern_title_overflow = self.model_sentence_transformer_overflow.encode(self.pattern_titles, convert_to_tensor=True) 297 | self.emb_pattern_excerpt_overflow = self.model_sentence_transformer_overflow.encode(self.pattern_excerpts, convert_to_tensor=True) 298 | 299 | # Save the embeddings for later use 300 | with open(PARENT_FOLDER + 'LTR_resources/emb_pattern.pkl', 'wb') as f: 301 | pickle.dump(self.emb_pattern, f) 302 | 303 | with open(PARENT_FOLDER + 'LTR_resources/emb_pattern_title.pkl', 'wb') as f: 304 | pickle.dump(self.emb_pattern_title, f) 305 | 306 | with open(PARENT_FOLDER + 'LTR_resources/emb_pattern_excerpt.pkl', 'wb') as f: 307 | 
pickle.dump(self.emb_pattern_excerpt, f) 308 | 309 | with open(PARENT_FOLDER + 'LTR_resources/emb_pattern_overflow.pkl', 'wb') as f: 310 | pickle.dump(self.emb_pattern_overflow, f) 311 | 312 | with open(PARENT_FOLDER + 'LTR_resources/emb_pattern_title_overflow.pkl', 'wb') as f: 313 | pickle.dump(self.emb_pattern_title_overflow, f) 314 | 315 | with open(PARENT_FOLDER + 'LTR_resources/emb_pattern_excerpt_overflow.pkl', 'wb') as f: 316 | pickle.dump(self.emb_pattern_excerpt_overflow, f) 317 | 318 | def semantic_similarity_features(self, q): 319 | emb_q = self.model_sentence_transformer.encode(q, convert_to_tensor=True) 320 | 321 | cosine_scores_pattern = util.cos_sim(emb_q, self.emb_pattern) 322 | cosine_scores_title = util.cos_sim(emb_q, self.emb_pattern_title) 323 | cosine_scores_excerpt = util.cos_sim(emb_q, self.emb_pattern_excerpt) 324 | 325 | emb_q = self.model_sentence_transformer_overflow.encode(q, convert_to_tensor=True) 326 | 327 | cosine_scores_pattern_overflow = util.cos_sim(emb_q, self.emb_pattern_overflow) 328 | cosine_scores_title_overflow = util.cos_sim(emb_q, self.emb_pattern_title_overflow) 329 | cosine_scores_excerpt_overflow = util.cos_sim(emb_q, self.emb_pattern_excerpt_overflow) 330 | 331 | return cosine_scores_pattern, cosine_scores_title, cosine_scores_excerpt, cosine_scores_pattern_overflow, cosine_scores_title_overflow, cosine_scores_excerpt_overflow 332 | 333 | def hadamard_product(self, tensor1, tensor2): 334 | return tensor1 * tensor2 335 | 336 | def deep_semantic_interaction_features(self, q): 337 | # Encode the query 338 | emb_q = self.model_sentence_transformer.encode(q, convert_to_tensor=True) 339 | emb_q_overflow = self.model_sentence_transformer_overflow.encode(q, convert_to_tensor=True) 340 | 341 | # Compute the "ideal" similarity, which is the query with itself 342 | ideal_emb = self.model_sentence_transformer.encode([q + " [SEP] " + q], convert_to_tensor=True) 343 | ideal_emb_overflow = self.model_sentence_transformer_overflow.encode([q + " [SEP] " + q], convert_to_tensor=True) 344 | 345 | # Compute Hadamard product between query and pattern embeddings 346 | hadamard_emb_pattern = self.hadamard_product(emb_q, self.emb_pattern) 347 | hadamard_emb_pattern_title = self.hadamard_product(emb_q, self.emb_pattern_title) 348 | hadamard_emb_pattern_excerpt = self.hadamard_product(emb_q, self.emb_pattern_excerpt) 349 | 350 | hadamard_emb_pattern_overflow = self.hadamard_product(emb_q_overflow, self.emb_pattern_overflow) 351 | hadamard_emb_pattern_title_overflow = self.hadamard_product(emb_q_overflow, self.emb_pattern_title_overflow) 352 | hadamard_emb_pattern_excerpt_overflow = self.hadamard_product(emb_q_overflow, self.emb_pattern_excerpt_overflow) 353 | 354 | # Compute Concatenation between query and pattern embeddings 355 | concat_emb_pattern = torch.cat((emb_q.unsqueeze(0), self.emb_pattern), dim=0) 356 | concat_emb_pattern_title = torch.cat((emb_q.unsqueeze(0), self.emb_pattern_title), dim=0) 357 | concat_emb_pattern_excerpt = torch.cat((emb_q.unsqueeze(0), self.emb_pattern_excerpt), dim=0) 358 | 359 | concat_emb_pattern_overflow = torch.cat((emb_q_overflow.unsqueeze(0), self.emb_pattern_overflow), dim=0) 360 | concat_emb_pattern_title_overflow = torch.cat((emb_q_overflow.unsqueeze(0), self.emb_pattern_title_overflow), dim=0) 361 | concat_emb_pattern_excerpt_overflow = torch.cat((emb_q_overflow.unsqueeze(0), self.emb_pattern_excerpt_overflow), dim=0) 362 | 363 | # Compute similarity between the query embeddings and the precomputed pattern embeddings 364 | 
similarities_pattern = util.pytorch_cos_sim(emb_q, self.emb_pattern) 365 | similarities_title = util.pytorch_cos_sim(emb_q, self.emb_pattern_title) 366 | similarities_excerpt = util.pytorch_cos_sim(emb_q, self.emb_pattern_excerpt) 367 | 368 | similarities_pattern_overflow = util.pytorch_cos_sim(emb_q_overflow, self.emb_pattern_overflow) 369 | similarities_title_overflow = util.pytorch_cos_sim(emb_q_overflow, self.emb_pattern_title_overflow) 370 | similarities_excerpt_overflow = util.pytorch_cos_sim(emb_q_overflow, self.emb_pattern_excerpt_overflow) 371 | 372 | # Returning features which include the similarities, the hadamard product embeddings, and concatenations 373 | return { 374 | "similarities": (similarities_pattern, similarities_title, similarities_excerpt, 375 | similarities_pattern_overflow, similarities_title_overflow, similarities_excerpt_overflow), 376 | "hadamard_products": (hadamard_emb_pattern, hadamard_emb_pattern_title, hadamard_emb_pattern_excerpt, 377 | hadamard_emb_pattern_overflow, hadamard_emb_pattern_title_overflow, hadamard_emb_pattern_excerpt_overflow), 378 | "concatenations": (concat_emb_pattern, concat_emb_pattern_title, concat_emb_pattern_excerpt, 379 | concat_emb_pattern_overflow, concat_emb_pattern_title_overflow, concat_emb_pattern_excerpt_overflow) 380 | } 381 | 382 | 383 | 384 | def predict_class(self, texts, model, vectorizer): 385 | loaded_model = pickle.load(open(model, 'rb')) 386 | loaded_vect = pickle.load(open(vectorizer, 'rb')) 387 | 388 | text = [preprocess_text(t, preprocess_functions) for t in texts] 389 | v_text = loaded_vect.transform(text) 390 | 391 | prediction = loaded_model.predict(v_text) 392 | 393 | return prediction 394 | 395 | def class_features(self, texts): 396 | PARENT_FOLDER = "" 397 | 398 | # BINARY CLASS 399 | binary_prediction = self.predict_class(texts, PARENT_FOLDER + "binary_nb_model.sav", PARENT_FOLDER + "binary_vectorizer_model.sav") 400 | 401 | # MULTI CLASS 402 | multi_prediction = self.predict_class(texts,PARENT_FOLDER + "multi_nb_model.sav",PARENT_FOLDER + "multi_vectorizer_model.sav") 403 | 404 | return binary_prediction, multi_prediction 405 | 406 | def process_fold_data(fold_type, fold_num, pattern_file_path, base_path, cache={}): 407 | """ 408 | Processes the given fold data (train or test) and writes the output to a file. 409 | 410 | Parameters: 411 | - fold_type (str): Either 'train' or 'test'. 412 | - fold_num (int): The fold number (1-5). 413 | - pattern_file_path (str): Path to the patterns file. 414 | - base_path (str): Base path for input and output files. 
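    - cache (dict): Memo of already-constructed feature lists keyed by query id. Passing the
      same dict across calls (as in the example usage below) avoids recomputing features for
      a query; note that the mutable default argument is shared between calls that omit it.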
415 | """ 416 | pp = PrivacyPatternFeatures() 417 | 418 | with open(pattern_file_path, 'r') as p: 419 | patterns = json.loads(p.read()) 420 | 421 | pattern_name = [pattern["title"].replace(".md", "") for i, pattern in enumerate(patterns)] 422 | 423 | with open(base_path + f"{fold_type}_patterns_req_v2_fold_{fold_num}.json", 'r', encoding="utf-8") as p: 424 | patterns_requirements = json.loads(p.read()) 425 | 426 | lines = [] 427 | for pr in patterns_requirements: 428 | print("Query_Id", pr["id"]) 429 | 430 | # Check if features are already calculated for this pr["id"] 431 | if pr["id"] not in cache: 432 | cache[pr["id"]] = pp.construct_features(pr["req_text"]) 433 | 434 | features = cache[pr["id"]] 435 | 436 | for i_pattern, p in enumerate(pr["pattern"]): 437 | idx = pattern_name.index(p["name"]) 438 | 439 | line = "" 440 | line += "{} qid:{}".format(p["rating"], pr["id"]) 441 | 442 | for i_feature, val in enumerate(features[idx]): 443 | line += " {}:{}".format(i_feature+1, val) 444 | 445 | line += " #docid={}".format(p["name"]) 446 | lines.append(line) 447 | 448 | # Save the processed lines to a file specific to the current fold and type (train/test) 449 | with open(base_path + f"{fold_type}_fold_{fold_num}.txt", "w") as f: 450 | for l in lines: 451 | f.write(l + "\n") 452 | 453 | 454 | # Example usage: 455 | base_path = "" 456 | pattern_file = base_path + "patterns.json" 457 | 458 | # Initialize a cache dictionary to store features 459 | features_cache = {} 460 | 461 | # Process all 5 folds for both training and testing data 462 | for fold_num in range(1, 6): 463 | process_fold_data('train', fold_num, pattern_file, base_path, features_cache) 464 | process_fold_data('test', fold_num, pattern_file, base_path, features_cache) 465 | --------------------------------------------------------------------------------
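The fold files emitted by process_fold_data follow the SVMlight/LETOR convention: `<rating> qid:<query_id> <feature_index>:<value> ... #docid=<pattern name>`. The repository's own training script (letor/lightgbm_ltr_train.py) is not reproduced in this dump, so the snippet below is only an illustrative sketch of how such a fold file could be loaded and fed to a LightGBM ranker; the file names assume the `f"{fold_type}_fold_{n}.txt"` convention used above.

```python
# Minimal sketch (assumed usage, not the repository's lightgbm_ltr_train.py):
# load one fold produced by process_fold_data and fit a LambdaMART-style ranker.
import itertools

import lightgbm as lgb
from sklearn.datasets import load_svmlight_file


def load_fold(path):
    # The trailing "#docid=..." comment is ignored by the SVMlight parser.
    X, y, qid = load_svmlight_file(path, query_id=True)
    # LightGBM needs group sizes (number of candidate patterns per query) in file order.
    group = [len(list(g)) for _, g in itertools.groupby(qid)]
    return X, y, group


X_train, y_train, group_train = load_fold("train_fold_1.txt")
X_test, y_test, group_test = load_fold("test_fold_1.txt")

ranker = lgb.LGBMRanker(objective="lambdarank", n_estimators=300, learning_rate=0.05)
ranker.fit(X_train, y_train, group=group_train)

# One score per (requirement, pattern) pair; higher means the pattern ranks nearer the top.
scores = ranker.predict(X_test)
```

The group argument is what distinguishes ranking from plain regression here: LightGBM only compares candidates within the same query (requirement) when optimizing the LambdaRank objective, so the per-query group sizes must match the order in which process_fold_data wrote the lines.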