├── annotation-program ├── templates │ ├── index.html │ ├── login.html │ ├── user_stats.html │ ├── tag.html │ ├── home.html │ ├── base.html │ ├── register.html │ └── annotate.html ├── migrations │ ├── README │ ├── __pycache__ │ │ └── env.cpython-39.pyc │ ├── versions │ │ ├── __pycache__ │ │ │ └── 164cfc37a367_added_similarity_model.cpython-39.pyc │ │ └── 164cfc37a367_added_similarity_model.py │ ├── script.py.mako │ ├── alembic.ini │ └── env.py └── app.py ├── classification ├── model │ ├── svm_model.sav │ ├── binary_nb_model.sav │ ├── binary_svm_model.sav │ ├── multi_nb_model.sav │ ├── multi_svm_model.sav │ ├── binary_vectorizer_model.sav │ ├── multi_vectorizer_model.sav │ └── binary_svm_vectorizer_model.sav └── sklearn_classifier.py ├── recommender ├── LTR_resources │ ├── emb_pattern.pkl │ ├── emb_pattern_title.pkl │ ├── emb_pattern_excerpt.pkl │ ├── emb_pattern_overflow.pkl │ ├── emb_pattern_title_overflow.pkl │ └── emb_pattern_excerpt_overflow.pkl ├── static │ └── shap_plots │ │ ├── pattern_0.png │ │ ├── pattern_1.png │ │ ├── pattern_2.png │ │ ├── pattern_4.png │ │ ├── pattern_5.png │ │ ├── pattern_7.png │ │ ├── pattern_9.png │ │ ├── pattern_10.png │ │ ├── pattern_11.png │ │ ├── pattern_14.png │ │ ├── pattern_16.png │ │ ├── pattern_18.png │ │ ├── pattern_22.png │ │ ├── pattern_25.png │ │ ├── pattern_27.png │ │ ├── pattern_29.png │ │ ├── pattern_30.png │ │ ├── pattern_31.png │ │ ├── pattern_36.png │ │ ├── pattern_37.png │ │ ├── pattern_38.png │ │ ├── pattern_40.png │ │ ├── pattern_41.png │ │ ├── pattern_42.png │ │ ├── pattern_43.png │ │ ├── pattern_44.png │ │ ├── pattern_45.png │ │ ├── pattern_47.png │ │ ├── pattern_49.png │ │ ├── pattern_51.png │ │ ├── pattern_52.png │ │ ├── pattern_53.png │ │ ├── pattern_54.png │ │ ├── pattern_55.png │ │ ├── pattern_56.png │ │ ├── pattern_57.png │ │ ├── pattern_59.png │ │ ├── pattern_61.png │ │ ├── pattern_63.png │ │ ├── pattern_64.png │ │ ├── pattern_66.png │ │ ├── pattern_72.png │ │ ├── waterfall_pattern_0.png │ │ ├── waterfall_pattern_1.png │ │ ├── waterfall_pattern_2.png │ │ ├── waterfall_pattern_4.png │ │ ├── waterfall_pattern_5.png │ │ ├── waterfall_pattern_7.png │ │ ├── waterfall_pattern_9.png │ │ ├── pattern_Lawful Consent.png │ │ ├── pattern_Onion Routing.png │ │ ├── pattern_Private link.png │ │ ├── waterfall_pattern_10.png │ │ ├── waterfall_pattern_11.png │ │ ├── waterfall_pattern_14.png │ │ ├── waterfall_pattern_16.png │ │ ├── waterfall_pattern_18.png │ │ ├── waterfall_pattern_22.png │ │ ├── waterfall_pattern_25.png │ │ ├── waterfall_pattern_27.png │ │ ├── waterfall_pattern_29.png │ │ ├── waterfall_pattern_30.png │ │ ├── waterfall_pattern_31.png │ │ ├── waterfall_pattern_36.png │ │ ├── waterfall_pattern_37.png │ │ ├── waterfall_pattern_38.png │ │ ├── waterfall_pattern_40.png │ │ ├── waterfall_pattern_41.png │ │ ├── waterfall_pattern_42.png │ │ ├── waterfall_pattern_43.png │ │ ├── waterfall_pattern_44.png │ │ ├── waterfall_pattern_45.png │ │ ├── waterfall_pattern_47.png │ │ ├── waterfall_pattern_49.png │ │ ├── waterfall_pattern_51.png │ │ ├── waterfall_pattern_52.png │ │ ├── waterfall_pattern_53.png │ │ ├── waterfall_pattern_54.png │ │ ├── waterfall_pattern_55.png │ │ ├── waterfall_pattern_56.png │ │ ├── waterfall_pattern_57.png │ │ ├── waterfall_pattern_59.png │ │ ├── waterfall_pattern_61.png │ │ ├── waterfall_pattern_63.png │ │ ├── waterfall_pattern_64.png │ │ ├── waterfall_pattern_66.png │ │ ├── waterfall_pattern_72.png │ │ ├── pattern_Personal Data Store.png │ │ ├── pattern_Psuedonymous Identity.png │ │ ├── pattern_Selective access 
control.png │ │ ├── pattern_Single Point of Contact.png │ │ ├── pattern_Obtaining Explicit Consent.png │ │ ├── pattern_Active broadcast of presence.png │ │ ├── pattern_Attribute Based Credentials.png │ │ ├── pattern_Protection against Tracking.png │ │ ├── pattern_[Support] Selective Disclosure.png │ │ ├── pattern_Encryption with user-managed keys.png │ │ ├── pattern_Anonymous Reputation-based Blacklisting.png │ │ └── pattern_Decoupling [content] and location information visibility.png ├── classification_model │ ├── multi_nb_model.sav │ ├── binary_nb_model.sav │ ├── binary_vectorizer_model.sav │ └── multi_vectorizer_model.sav ├── __pycache__ │ └── feature_creation.cpython-39.pyc ├── README.md ├── predict_console.py ├── templates │ └── index.html ├── data │ └── patterns_new.json ├── app.py └── feature_creation.py ├── README.md └── letor ├── feature_importance.py ├── lightgbm_ltr_train.py ├── compute_shap.py ├── case_studies.py └── feature_creation.py /annotation-program/templates/index.html: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /annotation-program/migrations/README: -------------------------------------------------------------------------------- 1 | Single-database configuration for Flask. 2 | -------------------------------------------------------------------------------- /classification/model/svm_model.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/classification/model/svm_model.sav -------------------------------------------------------------------------------- /classification/model/binary_nb_model.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/classification/model/binary_nb_model.sav -------------------------------------------------------------------------------- /classification/model/binary_svm_model.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/classification/model/binary_svm_model.sav -------------------------------------------------------------------------------- /classification/model/multi_nb_model.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/classification/model/multi_nb_model.sav -------------------------------------------------------------------------------- /classification/model/multi_svm_model.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/classification/model/multi_svm_model.sav -------------------------------------------------------------------------------- /recommender/LTR_resources/emb_pattern.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/LTR_resources/emb_pattern.pkl -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_0.png 
-------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_1.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_2.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_4.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_5.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_7.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_9.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_10.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_11.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_14.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_16.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_18.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_18.png 
-------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_22.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_22.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_25.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_25.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_27.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_27.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_29.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_29.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_30.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_30.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_31.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_31.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_36.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_36.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_37.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_37.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_38.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_38.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_40.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_40.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_41.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_41.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_42.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_42.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_43.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_43.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_44.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_44.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_45.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_45.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_47.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_47.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_49.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_49.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_51.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_51.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_52.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_52.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_53.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_53.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_54.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_54.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_55.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_55.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_56.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_56.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_57.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_57.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_59.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_59.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_61.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_61.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_63.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_63.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_64.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_64.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_66.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_66.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_72.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_72.png -------------------------------------------------------------------------------- /classification/model/binary_vectorizer_model.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/classification/model/binary_vectorizer_model.sav -------------------------------------------------------------------------------- /classification/model/multi_vectorizer_model.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/classification/model/multi_vectorizer_model.sav -------------------------------------------------------------------------------- 
/recommender/LTR_resources/emb_pattern_title.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/LTR_resources/emb_pattern_title.pkl -------------------------------------------------------------------------------- /recommender/LTR_resources/emb_pattern_excerpt.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/LTR_resources/emb_pattern_excerpt.pkl -------------------------------------------------------------------------------- /recommender/LTR_resources/emb_pattern_overflow.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/LTR_resources/emb_pattern_overflow.pkl -------------------------------------------------------------------------------- /recommender/classification_model/multi_nb_model.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/classification_model/multi_nb_model.sav -------------------------------------------------------------------------------- /classification/model/binary_svm_vectorizer_model.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/classification/model/binary_svm_vectorizer_model.sav -------------------------------------------------------------------------------- /recommender/classification_model/binary_nb_model.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/classification_model/binary_nb_model.sav -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_0.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_1.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_2.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_4.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_5.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_5.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_7.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_9.png -------------------------------------------------------------------------------- /recommender/LTR_resources/emb_pattern_title_overflow.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/LTR_resources/emb_pattern_title_overflow.pkl -------------------------------------------------------------------------------- /recommender/__pycache__/feature_creation.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/__pycache__/feature_creation.cpython-39.pyc -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_Lawful Consent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_Lawful Consent.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_Onion Routing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_Onion Routing.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_Private link.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_Private link.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_10.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_11.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_14.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_14.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_16.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_18.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_18.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_22.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_22.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_25.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_25.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_27.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_27.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_29.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_29.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_30.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_30.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_31.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_31.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_36.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_36.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_37.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_37.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_38.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_38.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_40.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_40.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_41.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_41.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_42.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_42.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_43.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_43.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_44.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_44.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_45.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_45.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_47.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_47.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_49.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_49.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_51.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_51.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_52.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_52.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_53.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_53.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_54.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_54.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_55.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_55.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_56.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_56.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_57.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_57.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_59.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_59.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_61.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_61.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_63.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_63.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_64.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_64.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_66.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_66.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/waterfall_pattern_72.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/waterfall_pattern_72.png -------------------------------------------------------------------------------- /recommender/LTR_resources/emb_pattern_excerpt_overflow.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/LTR_resources/emb_pattern_excerpt_overflow.pkl -------------------------------------------------------------------------------- /annotation-program/migrations/__pycache__/env.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/annotation-program/migrations/__pycache__/env.cpython-39.pyc -------------------------------------------------------------------------------- /recommender/classification_model/binary_vectorizer_model.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/classification_model/binary_vectorizer_model.sav -------------------------------------------------------------------------------- /recommender/classification_model/multi_vectorizer_model.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/classification_model/multi_vectorizer_model.sav -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_Personal Data Store.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_Personal Data Store.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_Psuedonymous Identity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_Psuedonymous Identity.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_Selective access control.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_Selective access control.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_Single Point of Contact.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_Single Point of Contact.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_Obtaining Explicit Consent.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_Obtaining Explicit Consent.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_Active broadcast of presence.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_Active broadcast of presence.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_Attribute Based Credentials.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_Attribute Based Credentials.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_Protection against Tracking.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_Protection against Tracking.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_[Support] Selective Disclosure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_[Support] Selective Disclosure.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_Encryption with user-managed keys.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_Encryption with user-managed keys.png -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_Anonymous Reputation-based Blacklisting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_Anonymous Reputation-based Blacklisting.png -------------------------------------------------------------------------------- /recommender/README.md: -------------------------------------------------------------------------------- 1 | # Privacy Design Pattern Recommender 2 | 3 | Run the recommender program by executing `python app.py`. 
4 | 5 | ## Prerequisites 6 | Install the following Python libraries: 7 | - lightgbm 8 | - shap 9 | - flask -------------------------------------------------------------------------------- /recommender/static/shap_plots/pattern_Decoupling [content] and location information visibility.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/recommender/static/shap_plots/pattern_Decoupling [content] and location information visibility.png -------------------------------------------------------------------------------- /annotation-program/migrations/versions/__pycache__/164cfc37a367_added_similarity_model.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gunturbudi/pattern-recommender/main/annotation-program/migrations/versions/__pycache__/164cfc37a367_added_similarity_model.cpython-39.pyc -------------------------------------------------------------------------------- /annotation-program/migrations/script.py.mako: -------------------------------------------------------------------------------- 1 | """${message} 2 | 3 | Revision ID: ${up_revision} 4 | Revises: ${down_revision | comma,n} 5 | Create Date: ${create_date} 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | ${imports if imports else ""} 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = ${repr(up_revision)} 14 | down_revision = ${repr(down_revision)} 15 | branch_labels = ${repr(branch_labels)} 16 | depends_on = ${repr(depends_on)} 17 | 18 | 19 | def upgrade(): 20 | ${upgrades if upgrades else "pass"} 21 | 22 | 23 | def downgrade(): 24 | ${downgrades if downgrades else "pass"} 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Learning-to-Rank Privacy Design Pattern 2 | 3 | Supplementary repository for "Learning to Rank Privacy Design Patterns: A Semantic Approach to Meeting Privacy Requirements" paper. 4 | 5 | ## Repository Structure 6 | - **annotation-program**: Contains annotation software and `annotation_result.json` for replicating relevance assessments. 7 | - **classification**: Code for text classification essential for feature engineering in LTR. 8 | - **letor**: Core code for the learning-to-rank process. 9 | - **recommender**: Flask program interfacing the recommender system. 10 | 11 | ## Training Data Access 12 | The training data, including Hadamard product and concatenation embedding, is available at [https://bit.ly/letor_priv](https://bit.ly/letor_priv) for download. -------------------------------------------------------------------------------- /annotation-program/templates/login.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block title %}Welcome{% endblock %} 4 | 5 | {% block content %} 6 | 7 |
8 |

Login

9 | {% for message in get_flashed_messages() %} 10 |
{{ message }}
11 | {% endfor %} 12 | 13 |
14 |
15 | 16 | 17 |
18 |
19 | 20 | 21 |
22 | 23 |
24 | Register 25 |
26 | 27 | {% endblock %} -------------------------------------------------------------------------------- /annotation-program/migrations/alembic.ini: -------------------------------------------------------------------------------- 1 | # A generic, single database configuration. 2 | 3 | [alembic] 4 | # template used to generate migration files 5 | # file_template = %%(rev)s_%%(slug)s 6 | 7 | # set to 'true' to run the environment during 8 | # the 'revision' command, regardless of autogenerate 9 | # revision_environment = false 10 | 11 | 12 | # Logging configuration 13 | [loggers] 14 | keys = root,sqlalchemy,alembic,flask_migrate 15 | 16 | [handlers] 17 | keys = console 18 | 19 | [formatters] 20 | keys = generic 21 | 22 | [logger_root] 23 | level = WARN 24 | handlers = console 25 | qualname = 26 | 27 | [logger_sqlalchemy] 28 | level = WARN 29 | handlers = 30 | qualname = sqlalchemy.engine 31 | 32 | [logger_alembic] 33 | level = INFO 34 | handlers = 35 | qualname = alembic 36 | 37 | [logger_flask_migrate] 38 | level = INFO 39 | handlers = 40 | qualname = flask_migrate 41 | 42 | [handler_console] 43 | class = StreamHandler 44 | args = (sys.stderr,) 45 | level = NOTSET 46 | formatter = generic 47 | 48 | [formatter_generic] 49 | format = %(levelname)-5.5s [%(name)s] %(message)s 50 | datefmt = %H:%M:%S 51 | -------------------------------------------------------------------------------- /annotation-program/migrations/versions/164cfc37a367_added_similarity_model.py: -------------------------------------------------------------------------------- 1 | """Added Similarity model 2 | 3 | Revision ID: 164cfc37a367 4 | Revises: 5 | Create Date: 2023-10-27 09:37:11.624391 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '164cfc37a367' 14 | down_revision = None 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | # ### commands auto generated by Alembic - please adjust! ### 21 | op.create_table('similarity', 22 | sa.Column('id', sa.Integer(), nullable=False), 23 | sa.Column('inquery_id', sa.Integer(), nullable=False), 24 | sa.Column('other_inquery_id', sa.Integer(), nullable=False), 25 | sa.Column('score', sa.Float(), nullable=False), 26 | sa.ForeignKeyConstraint(['inquery_id'], ['inquery.id'], ), 27 | sa.PrimaryKeyConstraint('id') 28 | ) 29 | # ### end Alembic commands ### 30 | 31 | 32 | def downgrade(): 33 | # ### commands auto generated by Alembic - please adjust! ### 34 | op.drop_table('similarity') 35 | # ### end Alembic commands ### 36 | -------------------------------------------------------------------------------- /annotation-program/templates/user_stats.html: -------------------------------------------------------------------------------- 1 | {% extends 'base.html' %} 2 | 3 | {% block content %} 4 |
5 |

Annotated Data

6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | {% for annotation in annotations_data %} 17 | 18 | 19 | 20 | 25 | 26 | 27 | {% endfor %} 28 | 29 |
# | Query | Candidates (Relevance) | Timestamp
{{ loop.index }}{{ annotation.query }} 21 | {% for candidate, relevance in annotation.candidates %} 22 | {{ candidate }} ({{ relevance }}){% if not loop.last %}, {% endif %} 23 | {% endfor %} 24 | {{ annotation.timestamp }}
30 |
31 | {% endblock %} 32 | -------------------------------------------------------------------------------- /annotation-program/templates/tag.html: -------------------------------------------------------------------------------- 1 | {% extends 'base.html' %} 2 | 3 | {% block content %} 4 |
5 |

Tag: {{ tag.name }}

6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | {% for candidate in candidates %} 17 | 18 | 19 | 20 | 21 | 26 | 27 | {% endfor %} 28 | 29 |
Query Text | Description | Source | Tags
{{ candidate.text }}{{ candidate.description }}source 22 | {% for tag in candidate.tags %} 23 | {{ tag.name }}{% if not loop.last %}, {% endif %} 24 | {% endfor %} 25 |
30 |
31 | {% endblock %} 32 | -------------------------------------------------------------------------------- /annotation-program/templates/home.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block title %} 4 | Welcome 5 | {% endblock %} 6 | 7 | {% block content %} 8 |
9 |
10 |
11 |

Privacy Requirements Collection

12 |

Welcome to the comprehensive collection of privacy requirements and design patterns

13 |
14 |
15 | 16 | {% for annotation in annotations_data %} 17 |
18 |
19 |

{{ annotation.source }}

20 | Start Annotation for {{ annotation.source }} 21 |
22 |
23 |
24 | {% set progress = annotation.done / annotation.total * 100 %} 25 |
29 | {{ progress|round(2) }}% 30 |
31 |
32 |

You've completed {{ annotation.done }} out of {{ annotation.total }} annotations for source "{{ annotation.source }}".

33 |
34 |
35 | {% endfor %} 36 |
37 | {% endblock %} 38 | -------------------------------------------------------------------------------- /annotation-program/templates/base.html: -------------------------------------------------------------------------------- 1 | {% extends "bootstrap/base.html" %} 2 | 3 | 4 | {% block navbar %} 5 | 38 | 39 | {% with messages = get_flashed_messages(with_categories=true) %} 40 | {% if messages %} 41 |
42 | {% for category, message in messages %} 43 |
44 | {{ message }} 45 |
46 | {% endfor %} 47 |
48 | {% endif %} 49 | {% endwith %} 50 | 51 | {% endblock %} 52 | 53 | -------------------------------------------------------------------------------- /annotation-program/templates/register.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block title %}Welcome{% endblock %} 4 | 5 | {% block content %} 6 | 7 |
8 |

Register

9 | {% for message in get_flashed_messages() %} 10 |
{{ message }}
11 | {% endfor %} 12 |
13 | 14 |
15 | 16 | 17 |
18 |
19 | 20 | 21 |
22 |
23 | 24 | 25 |
26 |
27 | 28 | 29 |
30 | 31 |
32 | 33 | 34 |
35 | 36 |
37 |
38 | Already have an account? Login 39 |
40 | 41 | {% endblock %} 42 | 43 | -------------------------------------------------------------------------------- /letor/feature_importance.py: -------------------------------------------------------------------------------- 1 | import lightgbm as lgb 2 | import numpy as np 3 | import pandas as pd 4 | 5 | base_path="train/" 6 | 7 | # Function to read and extract feature indices from LightSVM formatted data 8 | def get_feature_indices(files): 9 | all_features = set() 10 | for file_name in files: 11 | with open(base_path + file_name, 'r') as file: 12 | for line in file: 13 | # Remove the comment part of the line if it exists 14 | line = line.split('#')[0].strip() 15 | # Skip the label and qid, then extract feature indices 16 | tokens = line.strip().split()[2:] # Skip the label and qid 17 | features = {int(tok.split(':')[0]) for tok in tokens if ':' in tok} 18 | all_features.update(features) 19 | return all_features 20 | 21 | # Paths to your testing data files 22 | train_files = [ 23 | 'test_fold_1.txt' 24 | ] 25 | 26 | # Get all feature indices 27 | all_feature_indices = get_feature_indices(train_files) 28 | num_total_features = max(all_feature_indices) 29 | 30 | # Load your models and calculate feature importances 31 | model_files = [ 32 | 'model_fold_1.txt', 33 | 'model_fold_2.txt', 34 | 'model_fold_3.txt', 35 | 'model_fold_4.txt', 36 | 'model_fold_5.txt' 37 | ] 38 | 39 | # Initialize a dictionary to store the feature importances from all folds 40 | feature_importances = {f'f{i}': [] for i in range(1, num_total_features + 1)} 41 | 42 | # Load each model and gather the feature importances 43 | for model_file in model_files: 44 | bst = lgb.Booster(model_file=base_path + model_file) # Load the model 45 | fold_importance = bst.feature_importance(importance_type='gain') 46 | # Store the feature importances for the fold 47 | for i, importance in enumerate(fold_importance, start=1): 48 | feature_importances[f'f{i}'].append(importance) 49 | 50 | # Calculate the average importance for each feature 51 | average_importances = {feature: np.mean(importances) for feature, importances in feature_importances.items()} 52 | 53 | # Convert to a DataFrame for easier manipulation and saving to Excel 54 | importance_df = pd.DataFrame.from_dict(average_importances, orient='index', columns=['Average Importance']) 55 | importance_df.index.name = 'Feature' 56 | 57 | # Sort the DataFrame by the feature importances 58 | importance_df = importance_df.sort_values(by='Average Importance', ascending=False) 59 | 60 | # Save to an Excel file 61 | importance_df.to_excel(base_path + 'feature_importances.xlsx') 62 | 63 | print("Feature importances have been calculated and saved to feature_importances.xlsx") 64 | -------------------------------------------------------------------------------- /recommender/predict_console.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import lightgbm as lgb 4 | from feature_creation import PrivacyPatternFeatures 5 | 6 | pp = PrivacyPatternFeatures() 7 | 8 | with open("data/patterns.json", 'r') as p: 9 | patterns = json.loads(p.read()) 10 | 11 | pattern_name = [pattern["title"].replace(".md", "") for i, pattern in enumerate(patterns)] 12 | 13 | with open("data/patterns_new.json", 'r') as p: 14 | patterns_new = json.loads(p.read()) 15 | 16 | for p_new in patterns_new: 17 | patterns.append(p_new) 18 | pattern_name.append(p_new["title"]) 19 | 20 | def process_new_requirement(new_req_text): 21 | """ 22 | Process a new 
privacy requirement text to create a feature vector. 23 | 24 | Parameters: 25 | - new_req_text (str): The new privacy requirement text. 26 | - patterns (list): The list of patterns (already loaded from the patterns file). 27 | - pp (PrivacyPatternFeatures): The PrivacyPatternFeatures instance. 28 | - pattern_name (list): List of pattern names extracted from the patterns file. 29 | 30 | Returns: 31 | - A list of feature vectors for the new requirement text. 32 | """ 33 | features = pp.construct_features(new_req_text) 34 | feature_vectors = [] 35 | 36 | for idx, pattern in enumerate(patterns): 37 | feature_vector = features[idx] # Assuming features[idx] is already a list of features 38 | feature_vectors.append(feature_vector) 39 | 40 | return feature_vectors 41 | 42 | 43 | def predict_new_data(model, new_req_text): 44 | """ 45 | Predict the ranking for a new privacy requirement text using the trained model. 46 | 47 | Parameters: 48 | - model: The trained LightGBM model. 49 | - new_req_text (str): The new privacy requirement text. 50 | 51 | Returns: 52 | - The predictions and the sorted pattern names based on their rankings. 53 | """ 54 | 55 | # Process the new requirement text 56 | feature_vectors = process_new_requirement(new_req_text) 57 | data_matrix = np.array(feature_vectors) 58 | 59 | # Make predictions using the model 60 | predictions = model.predict(data_matrix) 61 | 62 | # Rank the pattern names based on predictions 63 | sorted_indices = np.argsort(predictions)[::-1] 64 | sorted_patterns = [pattern_name[i] for i in sorted_indices] 65 | 66 | return predictions, sorted_patterns 67 | 68 | # Example usage: 69 | new_req_text = """ 70 | 71 | 72 | If I have written a privacy tool as a .NET web application which is to be hosted on a commercial hosting site, other than hosting it in a privacy friendly country, how can I assure users that the application has not been compromised by a third party at the host? 73 | 74 | Obviously SSL will be used and the assemblies will be as obfuscated as possible, but these can only go so far. 75 | 76 | For example, is there a way I can ensure that my assemblies haven't been wrapped to intercept plain-text user details? 77 | 78 | """ 79 | model_file_path = "LTR_resources/model_fold_4_train_3_law.txt" # Provide the correct path to your trained model 80 | 81 | # Load the trained model 82 | bst = lgb.Booster(model_file=model_file_path) 83 | 84 | # Assuming 'pp' (PrivacyPatternFeatures instance) and 'pattern_name' list are already defined in your environment. 
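# A minimal illustrative helper (not part of the original script; the name `top_k_patterns` is hypothetical):
# given the (predictions, sorted_patterns) pair produced by the call below, it trims the full ranking to the
# k highest-scoring patterns together with their raw LightGBM scores. It relies only on numpy, imported above.
def top_k_patterns(predictions, names, k=5):
    """Return the k highest-scoring (pattern name, score) pairs."""
    order = np.argsort(predictions)[::-1][:k]
    return [(names[i], float(predictions[i])) for i in order]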
85 | predictions, sorted_patterns = predict_new_data(bst, new_req_text) 86 | 87 | # Print or process the predictions and sorted pattern names as needed 88 | print("Predictions:", predictions) 89 | print("Ranked Patterns:", sorted_patterns) 90 | -------------------------------------------------------------------------------- /letor/lightgbm_ltr_train.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import lightgbm as lgb 3 | import pandas as pd 4 | 5 | def parse_data(data): 6 | labels = [] 7 | features = [] 8 | qids = [] 9 | for line in data.strip().split('\n'): 10 | tokens = line.split() 11 | labels.append(float(tokens[0])) 12 | qids.append(int(tokens[1].split(':')[1])) 13 | feat_vals = {} 14 | for tok in tokens[2:]: 15 | if ':' in tok: 16 | feat, val = tok.split(':') 17 | feat_vals[int(feat)] = float(val) 18 | features.append(feat_vals) 19 | return labels, features, qids 20 | 21 | def create_dataset(features, labels, qids): 22 | num_features = max([max(feat_vals.keys()) for feat_vals in features]) 23 | data = np.array([[feat_vals.get(feat, 0) for feat in range(1, num_features+1)] for feat_vals in features]) 24 | group = np.unique(qids, return_counts=True)[1] 25 | return lgb.Dataset(data=data, label=labels, group=group, free_raw_data=False) 26 | 27 | # Train and evaluate model 28 | def train_evaluate(train_file, test_file): 29 | # Parse training data 30 | with open(train_file, "r") as file: 31 | train_labels, train_features, train_qids = parse_data(file.read()) 32 | 33 | # Parse test data 34 | with open(test_file, "r") as file: 35 | test_labels, test_features, test_qids = parse_data(file.read()) 36 | 37 | # Create datasets 38 | train_dataset = create_dataset(train_features, train_labels, train_qids) 39 | test_dataset = create_dataset(test_features, test_labels, test_qids) 40 | test_dataset.reference = train_dataset 41 | 42 | # Parameters 43 | params = { 44 | 'objective': 'lambdarank', 45 | 'metric': 'ndcg', 46 | 'ndcg_eval_at': list(range(1, 11)), 47 | 'learning_rate': 0.1, 48 | 'num_leaves': 31 49 | } 50 | num_round = 1000 51 | 52 | # Train model 53 | bst = lgb.train(params, train_dataset, num_round, valid_sets=[test_dataset], valid_names=['test']) 54 | 55 | # Make predictions 56 | max_feature_idx = max([max(feats.keys()) for feats in train_features]) 57 | test_data_matrix = np.array([[feat_vals.get(feat, 0) for feat in range(1, max_feature_idx+1)] for feat_vals in test_features]) 58 | test_preds = bst.predict(test_data_matrix) 59 | 60 | # Results 61 | results = {f"ndcg@{i+1}": bst.best_score['test'][f'ndcg@{i+1}'] for i in range(10)} 62 | 63 | return results, bst 64 | 65 | def generate_file_paths(fold_num, base_path="train_3"): 66 | """Generate file paths for training, testing, and model based on fold number.""" 67 | train_file = f"{base_path}/train_fold_{fold_num}.txt" 68 | test_file = f"{base_path}/test_fold_{fold_num}.txt" 69 | model_file = f"{base_path}/model_fold_{fold_num}.txt" 70 | return train_file, test_file, model_file 71 | 72 | def perform_5_fold_train_test(): 73 | """Perform 5-fold training and testing, saving results and models.""" 74 | results_list = [] 75 | for i in range(1, 6): 76 | train_file, test_file, model_file = generate_file_paths(i) 77 | 78 | results, bst = train_evaluate(train_file, test_file) 79 | results_list.append(results) 80 | bst.save_model(model_file) 81 | 82 | return results_list 83 | 84 | # Execute 5-fold training/testing and save results 85 | results_list = perform_5_fold_train_test() 86 | 
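# Illustrative addition (not in the original script): summarise the five folds by averaging each NDCG@k
# across results_list before the per-fold table is written to Excel below.
mean_ndcg = {metric: float(np.mean([fold[metric] for fold in results_list])) for metric in results_list[0]}
print("Mean NDCG across folds:", mean_ndcg)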
87 | # Store results in Excel 88 | df = pd.DataFrame(results_list) 89 | results_path = "train_results.xlsx" 90 | df.to_excel(results_path, index=False) 91 | -------------------------------------------------------------------------------- /annotation-program/migrations/env.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from logging.config import fileConfig 3 | 4 | from flask import current_app 5 | 6 | from alembic import context 7 | 8 | # this is the Alembic Config object, which provides 9 | # access to the values within the .ini file in use. 10 | config = context.config 11 | 12 | # Interpret the config file for Python logging. 13 | # This line sets up loggers basically. 14 | fileConfig(config.config_file_name) 15 | logger = logging.getLogger('alembic.env') 16 | 17 | 18 | def get_engine(): 19 | try: 20 | # this works with Flask-SQLAlchemy<3 and Alchemical 21 | return current_app.extensions['migrate'].db.get_engine() 22 | except (TypeError, AttributeError): 23 | # this works with Flask-SQLAlchemy>=3 24 | return current_app.extensions['migrate'].db.engine 25 | 26 | 27 | def get_engine_url(): 28 | try: 29 | return get_engine().url.render_as_string(hide_password=False).replace( 30 | '%', '%%') 31 | except AttributeError: 32 | return str(get_engine().url).replace('%', '%%') 33 | 34 | 35 | # add your model's MetaData object here 36 | # for 'autogenerate' support 37 | # from myapp import mymodel 38 | # target_metadata = mymodel.Base.metadata 39 | config.set_main_option('sqlalchemy.url', get_engine_url()) 40 | target_db = current_app.extensions['migrate'].db 41 | 42 | # other values from the config, defined by the needs of env.py, 43 | # can be acquired: 44 | # my_important_option = config.get_main_option("my_important_option") 45 | # ... etc. 46 | 47 | 48 | def get_metadata(): 49 | if hasattr(target_db, 'metadatas'): 50 | return target_db.metadatas[None] 51 | return target_db.metadata 52 | 53 | 54 | def run_migrations_offline(): 55 | """Run migrations in 'offline' mode. 56 | 57 | This configures the context with just a URL 58 | and not an Engine, though an Engine is acceptable 59 | here as well. By skipping the Engine creation 60 | we don't even need a DBAPI to be available. 61 | 62 | Calls to context.execute() here emit the given string to the 63 | script output. 64 | 65 | """ 66 | url = config.get_main_option("sqlalchemy.url") 67 | context.configure( 68 | url=url, target_metadata=get_metadata(), literal_binds=True 69 | ) 70 | 71 | with context.begin_transaction(): 72 | context.run_migrations() 73 | 74 | 75 | def run_migrations_online(): 76 | """Run migrations in 'online' mode. 77 | 78 | In this scenario we need to create an Engine 79 | and associate a connection with the context. 
80 | 81 | """ 82 | 83 | # this callback is used to prevent an auto-migration from being generated 84 | # when there are no changes to the schema 85 | # reference: http://alembic.zzzcomputing.com/en/latest/cookbook.html 86 | def process_revision_directives(context, revision, directives): 87 | if getattr(config.cmd_opts, 'autogenerate', False): 88 | script = directives[0] 89 | if script.upgrade_ops.is_empty(): 90 | directives[:] = [] 91 | logger.info('No changes in schema detected.') 92 | 93 | conf_args = current_app.extensions['migrate'].configure_args 94 | if conf_args.get("process_revision_directives") is None: 95 | conf_args["process_revision_directives"] = process_revision_directives 96 | 97 | connectable = get_engine() 98 | 99 | with connectable.connect() as connection: 100 | context.configure( 101 | connection=connection, 102 | target_metadata=get_metadata(), 103 | **conf_args 104 | ) 105 | 106 | with context.begin_transaction(): 107 | context.run_migrations() 108 | 109 | 110 | if context.is_offline_mode(): 111 | run_migrations_offline() 112 | else: 113 | run_migrations_online() 114 | -------------------------------------------------------------------------------- /recommender/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Privacy Requirement Predictor 7 | 8 | 26 | 27 | 28 | 29 |
44 | 45 | 46 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | -------------------------------------------------------------------------------- /recommender/data/patterns_new.json: -------------------------------------------------------------------------------- 1 | [{ 2 | "title": "Federated Learning", 3 | "excerpt": "Machine learning technique that trains an algorithm across multiple decentralized edge devices or servers holding local data samples, without exchanging them.", 4 | "description": "Federated learning (also known as collaborative learning) is a machine learning technique that trains an algorithm across multiple decentralized edge devices or servers holding local data samples, without exchanging them. This approach stands in contrast to traditional centralized machine learning techniques where all the local datasets are uploaded to one server, as well as to more classical decentralized approaches which often assume that local data samples are identically distributed. Federated learning enables multiple actors to build a common, robust machine learning model without sharing data, thus allowing to address critical issues such as data privacy, data security, data access rights and access to heterogeneous data. Its applications are spread over a number of industries including defense, telecommunications, IoT, and pharmaceutics. A major open question at the moment is how inferior models learned through federated data are relative to ones where the data are pooled. Another open question concerns the trustworthiness of the edge devices and the impact of malicious actors on the learned model. Federated learning aims at training a machine learning algorithm, for instance deep neural networks, on multiple local datasets contained in local nodes without explicitly exchanging data samples. The general principle consists in training local models on local data samples and exchanging parameters (e.g. the weights and biases of a deep neural network) between these local nodes at some frequency to generate a global model shared by all nodes. The main difference between federated learning and distributed learning lies in the assumptions made on the properties of the local datasets,[1] as distributed learning originally aims at parallelizing computing power where federated learning originally aims at training on heterogeneous datasets. While distributed learning also aims at training a single model on multiple servers, a common underlying assumption is that the local datasets are independent and identically distributed (i.i.d.) and roughly have the same size. None of these hypotheses are made for federated learning; instead, the datasets are typically heterogeneous and their sizes may span several orders of magnitude. Moreover, the clients involved in federated learning may be unreliable as they are subject to more failures or drop out since they commonly rely on less powerful communication media (i.e. Wi-Fi) and battery-powered systems (i.e. smartphones and IoT devices) compared to distributed learning where nodes are typically datacenters that have powerful computational capabilities and are connected to one another with fast networks. In the centralized federated learning setting, a central server is used to orchestrate the different steps of the algorithms and coordinate all the participating nodes during the learning process. The server is responsible for the nodes selection at the beginning of the training process and for the aggregation of the received model updates. 
Since all the selected nodes have to send updates to a single entity, the server may become a bottleneck of the system. In the decentralized federated learning setting, the nodes are able to coordinate themselves to obtain the global model. This setup prevents single point failures as the model updates are exchanged only between interconnected nodes without the orchestration of the central server. Nevertheless, the specific network topology may affect the performances of the learning process.[2] See blockchain-based federated learning[3] and the references therein. An increasing number of application domains involve a large set of heterogeneous clients, e.g., mobile phones and IoT devices.[4] Most of the existing Federated learning strategies assume that local models share the same global model architecture. Recently, a new federated learning framework named HeteroFL was developed to address heterogeneous clients equipped with very different computation and communication capabilities.[5] The HeteroFL technique can enable the training of heterogeneous local models with dynamically varying computation and non-iid data complexities while still producing a single accurate global inference model.", 5 | "source": "https://en.wikipedia.org/wiki/Federated_learning" 6 | }, 7 | { 8 | "title": "FIDO Authentication", 9 | "excerpt": "The FIDO Alliance is involved in three areas to work towards achieving its mission to reduce the world’s reliance on passwords to better secure the web: user authentication; identity verification and binding; and the Internet of Things (IoT)", 10 | "description": "The FIDO Alliance is involved in three areas to work towards achieving its mission to reduce the world’s reliance on passwords to better secure the web: user authentication; identity verification and binding; and the Internet of Things (IoT). The work areas address essential aspects of the digital identity lifecycle management including identity verification for initial account onboarding and account recovery, and user and device authentication. Passwords endure despite the growing consensus their use needs to be reduced, if not replaced. But even though effective PKI and strong authentication solutions have existed for years, barriers to widespread adoption persist. Consumers don’t like the user experience, and online service providers don’t want the cost and complexity of developing and provisioning their own dedicated solutions. The industry’s answer to the password problem. The FIDO Alliance developed FIDO Authentication standards based on public key cryptography for authentication that is more secure than passwords and SMS OTPs, simpler for consumers to use, and easier for service providers to deploy and manage. 
FIDO Authentication enables password-only logins to be replaced with secure and fast login experiences across websites and apps.", 11 | "source": "https://fidoalliance.org/fido-authentication/" 12 | } 13 | ] -------------------------------------------------------------------------------- /letor/compute_shap.py: -------------------------------------------------------------------------------- 1 | import shap 2 | import lightgbm as lgb 3 | import pandas as pd 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | 7 | # Assuming 'model_fold_1.txt' is your model file and 'test_fold_1.txt' is your test data file 8 | PARENT_PATH = "resultfold/train_1/" 9 | model_path = PARENT_PATH + 'model_fold_4.txt' 10 | test_path = PARENT_PATH + 'test_fold_4.txt' 11 | 12 | 13 | def define_feature_names(): 14 | # Generate feature names for the first 4 features 15 | feature_names = [ 16 | "Covered Words", "Covered Words Ratio", 17 | "Length of Query", "IDF of Query" 18 | ] 19 | 20 | # Generate feature names for TF features (5-14) 21 | tf_feature_names = [ 22 | "TF Sum", "TF Min", "TF Max", "TF Average", "TF Variance", 23 | "Normalized TF Sum", "Normalized TF Min", "Normalized TF Max", "Normalized TF Average", "Normalized TF Variance" 24 | ] 25 | feature_names.extend(tf_feature_names) 26 | 27 | # Generate feature names for TF-IDF features (15-19) 28 | tf_idf_feature_names = [ 29 | "TF-IDF Sum", "TF-IDF Min", "TF-IDF Max", "TF-IDF Average", "TF-IDF Variance" 30 | ] 31 | feature_names.extend(tf_idf_feature_names) 32 | 33 | feature_names.extend([ 34 | "BM25", "Content Similarity #1", "Title Similarity #1", "Excerpt Similarity #1", 35 | "Content Similarity #2", "Title Similarity #2", "Excerpt Similarity #2", 36 | "Binary Query", "Multi Query", "Binary Pattern", "Multi Pattern", 37 | "Content Similarity #1.1", "Title Similarity #1.1", "Excerpt Similarity #1.1", 38 | "Content Similarity #2.1", "Title Similarity #2.1", "Excerpt Similarity #2.1" 39 | ]) 40 | 41 | # return here for train_1 42 | # return feature_names 43 | 44 | # Define the naming for hadamard product features 45 | hadamard_feature_sets = [ 46 | "Hadamard Content #1", "Hadamard Title #1", "Hadamard Excerpt #1", 47 | "Hadamard Content #2", "Hadamard Title #2", "Hadamard Excerpt #2" 48 | ] 49 | 50 | 51 | # Generate feature names for the hadamard product features 52 | for feature_set_name in hadamard_feature_sets: 53 | for i in range(1, 769): # Each set has 768 features 54 | feature_names.append(f"{feature_set_name} {i}") 55 | 56 | # return here for train_2 57 | # return feature_names 58 | 59 | # Define the naming for concatenation features 60 | concat_feature_sets = [ 61 | "Concat Content #1", "Concat Title #1", "Concat Excerpt #1", 62 | "Concat Content #2", "Concat Title #2", "Concat Excerpt #2" 63 | ] 64 | 65 | # Generate feature names for the concatenation features 66 | for feature_set_name in concat_feature_sets: 67 | for i in range(1, 769): # Each set has 768 features 68 | feature_names.append(f"{feature_set_name} {i}") 69 | 70 | # return here for train_3 71 | return feature_names 72 | 73 | 74 | feature_names = define_feature_names() 75 | 76 | # Define a function to parse the data, as it's used for both training and test data 77 | def parse_data(data): 78 | labels = [] 79 | features = [] 80 | qids = [] 81 | for line in data.strip().split('\n'): 82 | tokens = line.split() 83 | labels.append(float(tokens[0])) 84 | qids.append(int(tokens[1].split(':')[1])) 85 | feat_vals = {} 86 | for tok in tokens[2:]: 87 | if ':' in tok: 88 | feat, val = 
tok.split(':') 89 | feat_vals[int(feat)] = float(val) 90 | features.append(feat_vals) 91 | return labels, features, qids 92 | 93 | 94 | # Load the trained model 95 | bst = lgb.Booster(model_file=model_path) 96 | 97 | # Manually set the objective parameter if it's not present 98 | if 'objective' not in bst.params: 99 | bst.params['objective'] = 'lambdarank' 100 | 101 | # Parse test data 102 | with open(test_path, "r") as file: 103 | test_labels, test_features, test_qids = parse_data(file.read()) 104 | 105 | # Prepare test data matrix 106 | max_feature_idx = max([max(feats.keys()) for feats in test_features]) 107 | test_data_matrix = np.array([[feat_vals.get(feat, 0) for feat in range(1, max_feature_idx+1)] for feat_vals in test_features]) 108 | 109 | # Create SHAP explainer 110 | explainer = shap.TreeExplainer(bst) 111 | 112 | # Compute SHAP values 113 | shap_values = explainer.shap_values(test_data_matrix) 114 | 115 | shap_explanation = shap.Explanation(values=shap_values[0], 116 | base_values=explainer.expected_value, 117 | data=test_data_matrix[0], feature_names=feature_names) 118 | 119 | # Save the SHAP summary bar plot with the top 10 features to a file 120 | shap.summary_plot(shap_values, test_data_matrix, plot_type='bar', feature_names=feature_names, show=False, max_display=10) 121 | plt.savefig('ltr_shap_summary_bar_top10_named.png') 122 | plt.close() 123 | 124 | # Save the SHAP beeswarm plot with the top 10 features to a file 125 | shap.summary_plot(shap_values, test_data_matrix, show=False, feature_names=feature_names, max_display=10) 126 | plt.savefig('ltr_shap_summary_beeswarm_top10_named.png') 127 | plt.close() 128 | 129 | 130 | # Save the SHAP dependence plot for a specific feature (e.g., Feature 1) to a file 131 | shap.dependence_plot(21, shap_values, test_data_matrix, feature_names=feature_names, show=False) 132 | plt.savefig('ltr_shap_dependence_plot_feature_22.png') 133 | plt.close() 134 | 135 | # Save the SHAP dependence plot for a specific feature (e.g., Feature 1) to a file 136 | # shap.dependence_plot(597, shap_values, test_data_matrix, feature_names=feature_names, show=False) 137 | # plt.savefig('ltr_shap_dependence_plot_feature_597.png') 138 | # plt.close() 139 | 140 | # Save the SHAP waterfall plot for the first prediction to a file 141 | shap_waterfall_plot = plt.figure() 142 | shap.plots.waterfall(shap_explanation, max_display=10) 143 | shap_waterfall_plot.savefig('ltr_shap_waterfall_plot_named.png') 144 | plt.close(shap_waterfall_plot) 145 | 146 | 147 | -------------------------------------------------------------------------------- /letor/case_studies.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | import numpy as np 3 | from rich.console import Console 4 | from rich.table import Table 5 | 6 | def dcg_at_k(r: List[int], k: int) -> float: 7 | """Discounted Cumulative Gain at rank k.""" 8 | r = np.asfarray(r)[:k] 9 | return np.sum(r / np.log2(np.arange(2, r.size + 2))) 10 | 11 | def ndcg_at_k(r: List[int], k: int) -> float: 12 | """Normalized Discounted Cumulative Gain at rank k.""" 13 | dcg_max = dcg_at_k(sorted(r, reverse=True), k) 14 | if not dcg_max: 15 | return 0. 
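    # The return below computes DCG(r, k) / DCG(sorted r, k). Worked example (illustrative, not part of the
    # original code): for r = [3, 2, 0] and k = 3, the DCG is 3/log2(2) + 2/log2(3) + 0/log2(4) ≈ 4.262 and the
    # ideal ordering gives the same value, so NDCG@3 = 1.0; for r = [0, 2, 3] the DCG is ≈ 2.762, so NDCG@3 ≈ 0.648.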
16 | return dcg_at_k(r, k) / dcg_max 17 | 18 | def calculate_metrics(recommendations: List[List[str]], ideal: List[List[str]]) -> dict: 19 | max_k = 5 # Maximum rank for NDCG calculations 20 | ndcg_scores = {k: [] for k in range(1, max_k + 1)} 21 | ndcg_details = [] 22 | 23 | for rec, ideal_rec in zip(recommendations, ideal): 24 | # Convert to graded relevance 25 | relevance = [] 26 | for i in range(len(rec)): 27 | if rec[i] in ideal_rec: 28 | pos_diff = abs(ideal_rec.index(rec[i]) - i) 29 | relevance_score = max(5 - pos_diff, 1) 30 | else: 31 | relevance_score = 0 32 | relevance.append(relevance_score) 33 | 34 | row_ndcg = {} 35 | for k in range(1, max_k + 1): 36 | ndcg_score = ndcg_at_k(relevance, k) 37 | ndcg_scores[k].append(ndcg_score) 38 | row_ndcg[f'NDCG@{k}'] = ndcg_score 39 | ndcg_details.append(row_ndcg) 40 | 41 | metrics = { 42 | "mean_ndcg": {k: np.mean(ndcg_scores[k]) for k in ndcg_scores}, 43 | "ndcg_details": ndcg_details 44 | } 45 | 46 | return metrics 47 | 48 | def find_best_ndcg_rows(ndcg_details): 49 | best_rows = {} 50 | for k in range(1, 6): # For each NDCG rank from 1 to 5 51 | best_score = -1 52 | best_row_index = -1 53 | for i, row in enumerate(ndcg_details): 54 | if row[f'NDCG@{k}'] > best_score: 55 | best_score = row[f'NDCG@{k}'] 56 | best_row_index = i 57 | best_rows[f'best_row_for_ndcg@{k}'] = (best_row_index, best_score) 58 | return best_rows 59 | 60 | def sort_ndcg_rows(ndcg_details): 61 | sorted_rows = {} 62 | for k in range(1, 6): # For each NDCG rank from 1 to 5 63 | rows_with_scores = [(i, row[f'NDCG@{k}']) for i, row in enumerate(ndcg_details)] 64 | rows_with_scores.sort(key=lambda x: x[1], reverse=True) # Sort by NDCG score, highest first 65 | sorted_rows[f'ndcg@{k}_sorted'] = rows_with_scores 66 | return sorted_rows 67 | 68 | def display_sorted_ndcg_rows(sorted_ndcg_rows): 69 | console = Console() 70 | for k in sorted_ndcg_rows: 71 | table = Table(show_header=True, header_style="bold magenta") 72 | table.add_column("Row Index", style="dim") 73 | table.add_column(f"NDCG@{k.split('@')[1].split('_')[0]}", justify="right") # keys look like 'ndcg@3_sorted'; k[-1] would render the header as 'NDCG@d' 74 | 75 | for row_index, score in sorted_ndcg_rows[k]: 76 | table.add_row(str(row_index), f"{score:.4f}") 77 | 78 | console.print(f"Sorted Rows for {k.upper()}") 79 | console.print(table) 80 | 81 | 82 | def calculate_average_ndcg_per_row(ndcg_details): 83 | average_ndcg_scores = [] 84 | for row in ndcg_details: 85 | average_score = sum(row[f'NDCG@{k}'] for k in range(1, 6)) / 5 86 | average_ndcg_scores.append(average_score) 87 | return average_ndcg_scores 88 | 89 | def display_sorted_average_ndcg_rows(average_ndcg_scores): 90 | sorted_average_scores = sorted(enumerate(average_ndcg_scores), key=lambda x: x[1], reverse=True) 91 | 92 | console = Console() 93 | table = Table(show_header=True, header_style="bold magenta") 94 | table.add_column("Row Index", style="dim") 95 | table.add_column("Average NDCG Score", justify="right") 96 | 97 | for row_index, score in sorted_average_scores: 98 | table.add_row(str(row_index), f"{score:.4f}") 99 | 100 | console.print("Rows Sorted by Average NDCG Score (Best to Worst)") 101 | console.print(table) 102 | 103 | recommendations = [ 104 | ["Attribute Based Credentials", "Psuedonymous Identity", "Location Granularity", 105 | "Decoupling content and location information visibility", "Onion Routing"], 106 | ["Psuedonymous Identity", "Attribute Based Credentials", "Obtaining Explicit Consent", 107 | "Onion Routing", "Protection against Tracking"], 108 | ["Support Selective Disclosure", "Pseudonymous Messaging", "Psuedonymous
Identity", 109 | "Attribute Based Credentials", "Added-noise measurement obfuscation"], 110 | ["Attribute Based Credentials", "Active broadcast of presence", "Added-noise measurement obfuscation", 111 | "Awareness Feed", "Personal Data Store"], 112 | ["Strip Invisible Metadata", "Added-noise measurement obfuscation", "Lawful Consent", 113 | "Obtaining Explicit Consent", "Attribute Based Credentials"], 114 | ["Active broadcast of presence", "Privacy dashboard", "Attribute Based Credentials", 115 | "Abridged Terms and Conditions", "Dynamic Privacy Policy Display"], 116 | ["Lawful Consent", "Attribute Based Credentials", "Awareness Feed", 117 | "Selective access control", "Informed Implicit Consent"] 118 | ] 119 | 120 | recommendations_ideal = [ 121 | ["Location Granularity", "Decoupling content and location information visibility", "Psuedonymous Identity", "Attribute Based Credentials", "O"], 122 | ["Protection against Tracking", "Psuedonymous Identity", "Attribute Based Credentials", "Onion Routing", "O"], 123 | ["Support Selective Disclosure", "Pseudonymous Messaging", "Psuedonymous Identity", 124 | "Attribute Based Credentials", "Added-noise measurement obfuscation"], 125 | ["Attribute Based Credentials", "Added-noise measurement obfuscation", "Active broadcast of presence", "Awareness Feed", "Personal Data Store"], 126 | ["Strip Invisible Metadata", "Added-noise measurement obfuscation", "O", 127 | "O", "Attribute Based Credentials"], 128 | ["Active broadcast of presence", "Abridged Terms and Conditions", "Privacy dashboard", "Dynamic Privacy Policy Display", "O"], 129 | ["Lawful Consent", "Awareness Feed", 130 | "Informed Implicit Consent", "Selective access control", "O"] 131 | ] 132 | 133 | # Example usage 134 | metrics = calculate_metrics(recommendations, recommendations_ideal) 135 | sorted_ndcg_rows = sort_ndcg_rows(metrics['ndcg_details']) 136 | best_ndcg_rows = find_best_ndcg_rows(metrics['ndcg_details']) 137 | 138 | print("Metrics:", metrics) 139 | print("==="*5) 140 | print("Best NDCG rows:", best_ndcg_rows) 141 | print("==="*5) 142 | display_sorted_ndcg_rows(sorted_ndcg_rows) 143 | 144 | average_ndcg_scores = calculate_average_ndcg_per_row(metrics['ndcg_details']) 145 | 146 | # Print using rich console 147 | display_sorted_average_ndcg_rows(average_ndcg_scores) -------------------------------------------------------------------------------- /recommender/app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, render_template, request, jsonify, url_for 2 | import json 3 | import numpy as np 4 | import lightgbm as lgb 5 | from feature_creation import PrivacyPatternFeatures 6 | import os 7 | import shap 8 | import matplotlib.pyplot as plt 9 | 10 | app = Flask(__name__) 11 | pp = PrivacyPatternFeatures() 12 | 13 | # Load the patterns and the trained model 14 | with open("data/patterns.json", 'r') as p: 15 | patterns = json.load(p) 16 | 17 | pattern_name = [pattern["title"].replace(".md", "") for pattern in patterns] 18 | 19 | # Load any additional patterns 20 | with open("data/patterns_new.json", 'r') as p: 21 | patterns_new = json.load(p) 22 | 23 | for p_new in patterns_new: 24 | patterns.append(p_new) 25 | pattern_name.append(p_new["title"]) 26 | 27 | model_file_path = "LTR_resources/model_fold_4_train_3.txt" # the path to LeToR trained model on LightGBM 28 | if os.path.exists(model_file_path): 29 | bst = lgb.Booster(model_file=model_file_path) 30 | bst.params['objective'] = 'lambdarank' 31 | else: 32 | raise 
FileNotFoundError("Model file not found.") 33 | 34 | @app.route('/') 35 | def index(): 36 | return render_template('index.html') 37 | 38 | @app.route('/predict', methods=['POST']) 39 | def predict(): 40 | new_req_text = request.form['requirement'] 41 | feature_vectors = process_new_requirement(new_req_text) 42 | data_matrix = np.array(feature_vectors) 43 | 44 | # Get sorted patterns with SHAP plots 45 | sorted_patterns_with_shap = predict_new_data(bst, data_matrix) 46 | 47 | # Get indices of sorted patterns from original pattern list 48 | sorted_pattern_indices = [pattern_name.index(pattern[0]) for pattern in sorted_patterns_with_shap] 49 | 50 | # Use these indices to get the correct excerpts and the SHAP image paths 51 | patterns_with_desc_and_shap = [ 52 | { 53 | "pattern": sorted_patterns_with_shap[i][0], 54 | "excerpt": patterns[sorted_pattern_indices[i]]["excerpt"], 55 | "shap_plot_path": sorted_patterns_with_shap[i][2], # SHAP plot image path 56 | "shap_waterfall_plot_path": sorted_patterns_with_shap[i][3] # SHAP plot image path 57 | } 58 | for i in range(len(sorted_patterns_with_shap)) 59 | ] 60 | 61 | # Return JSON response with paths to SHAP plot images 62 | return jsonify(sorted_patterns=patterns_with_desc_and_shap) 63 | 64 | def get_feature_names(): 65 | # Generate feature names for the first 4 features 66 | feature_names = [ 67 | "Covered Words", "Covered Words Ratio", 68 | "Length of Query", "IDF of Query" 69 | ] 70 | 71 | # Generate feature names for TF features (5-14) 72 | tf_feature_names = [ 73 | "TF Sum", "TF Min", "TF Max", "TF Average", "TF Variance", 74 | "Normalized TF Sum", "Normalized TF Min", "Normalized TF Max", "Normalized TF Average", "Normalized TF Variance" 75 | ] 76 | feature_names.extend(tf_feature_names) 77 | 78 | # Generate feature names for TF-IDF features (15-19) 79 | tf_idf_feature_names = [ 80 | "TF-IDF Sum", "TF-IDF Min", "TF-IDF Max", "TF-IDF Average", "TF-IDF Variance" 81 | ] 82 | feature_names.extend(tf_idf_feature_names) 83 | 84 | feature_names.extend([ 85 | "BM25", "Content Similarity #1", "Title Similarity #1", "Excerpt Similarity #1", 86 | "Content Similarity #2", "Title Similarity #2", "Excerpt Similarity #2", 87 | "Binary Query", "Multi Query", "Binary Pattern", "Multi Pattern", 88 | "Content Similarity #1.1", "Title Similarity #1.1", "Excerpt Similarity #1.1", 89 | "Content Similarity #2.1", "Title Similarity #2.1", "Excerpt Similarity #2.1" 90 | ]) 91 | 92 | # return here for train_1 93 | # return feature_names 94 | 95 | # Define the naming for hadamard product features 96 | hadamard_feature_sets = [ 97 | "Hadamard Content #1", "Hadamard Title #1", "Hadamard Excerpt #1", 98 | "Hadamard Content #2", "Hadamard Title #2", "Hadamard Excerpt #2" 99 | ] 100 | 101 | 102 | # Generate feature names for the hadamard product features 103 | for feature_set_name in hadamard_feature_sets: 104 | for i in range(1, 769): # Each set has 768 features 105 | feature_names.append(f"{feature_set_name} {i}") 106 | 107 | # return here for train_2 108 | # return feature_names 109 | 110 | # Define the naming for concatenation features 111 | concat_feature_sets = [ 112 | "Concat Content #1", "Concat Title #1", "Concat Excerpt #1", 113 | "Concat Content #2", "Concat Title #2", "Concat Excerpt #2" 114 | ] 115 | 116 | # Generate feature names for the concatenation features 117 | for feature_set_name in concat_feature_sets: 118 | for i in range(1, 769): # Each set has 768 features 119 | feature_names.append(f"{feature_set_name} {i}") 120 | 121 | # return here for train_3 
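    # Feature-count note (derived from the construction above): the base list holds 36 names
    # (4 query stats + 10 TF + 5 TF-IDF + BM25 + 12 similarity + 4 classifier features); each
    # Hadamard or concatenation set adds 6 * 768 = 4,608 names, so the train_1 / train_2 /
    # train_3 return points yield 36 / 4,644 / 9,252 feature names respectively.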
122 | return feature_names 123 | 124 | def process_new_requirement(new_req_text): 125 | """ 126 | Process a new privacy requirement text to create a feature vector. 127 | 128 | Parameters: 129 | - new_req_text (str): The new privacy requirement text. 130 | - patterns (list): The list of patterns (already loaded from the patterns file). 131 | - pp (PrivacyPatternFeatures): The PrivacyPatternFeatures instance. 132 | - pattern_name (list): List of pattern names extracted from the patterns file. 133 | 134 | Returns: 135 | - A list of feature vectors for the new requirement text. 136 | """ 137 | features = pp.construct_features(new_req_text) 138 | feature_vectors = [] 139 | 140 | for idx, pattern in enumerate(patterns): 141 | feature_vector = features[idx] # Assuming features[idx] is already a list of features 142 | feature_vectors.append(feature_vector) 143 | 144 | return feature_vectors 145 | 146 | def predict_new_data(model, feature_vectors): 147 | print(len(feature_vectors)) 148 | # Make predictions using the model 149 | predictions = model.predict(feature_vectors) 150 | 151 | 152 | # Rank the pattern names based on predictions and select only the top 7 153 | sorted_indices = np.argsort(predictions)[::-1][:7] 154 | top_sorted_patterns = [pattern_name[i] for i in sorted_indices] 155 | 156 | # Generate SHAP values 157 | explainer = shap.TreeExplainer(model) 158 | shap_values = explainer.shap_values(feature_vectors) 159 | 160 | # Generate SHAP force plot images for each prediction 161 | shap_image_paths = [] 162 | shap_waterfall_image_paths = [] 163 | for i in sorted_indices: 164 | print(shap_values[i]) 165 | print(feature_vectors[i]) 166 | 167 | # FORCE PLOT 168 | plt.figure() 169 | shap.force_plot( 170 | explainer.expected_value, shap_values[i], feature_vectors[i], 171 | feature_names=get_feature_names(), matplotlib=True, show=False 172 | ) 173 | image_url = url_for('static', filename=f'shap_plots/pattern_{i}.png') 174 | image_path = f"static/shap_plots/pattern_{i}.png" 175 | plt.savefig(image_path) 176 | shap_image_paths.append(image_url) 177 | plt.close() 178 | 179 | # WATERFALL PLOT 180 | plt.figure() 181 | plt.tight_layout() 182 | # Create an Explanation object 183 | shap_explanation = shap.Explanation( 184 | values=shap_values[i], 185 | base_values=explainer.expected_value, 186 | data=feature_vectors[i], 187 | feature_names=get_feature_names() 188 | ) 189 | # Generate a waterfall plot for the i-th prediction 190 | shap.plots.waterfall(shap_explanation, max_display=14, show=False) 191 | waterfall_image_path = f"static/shap_plots/waterfall_pattern_{i}.png" 192 | plt.savefig(waterfall_image_path, bbox_inches='tight') 193 | shap_waterfall_image_paths.append(url_for('static', filename=f"shap_plots/waterfall_pattern_{i}.png")) 194 | plt.close() 195 | 196 | # Combine predictions, pattern names, and their corresponding SHAP values 197 | top_patterns_with_shap = [ 198 | (pattern_name[i], predictions[i], shap_image_paths[j], shap_waterfall_image_paths[j]) 199 | for j, i in enumerate(sorted_indices) 200 | ] 201 | 202 | return top_patterns_with_shap 203 | 204 | if __name__ == '__main__': 205 | app.run(debug=True) 206 | -------------------------------------------------------------------------------- /annotation-program/templates/annotate.html: -------------------------------------------------------------------------------- 1 | {% extends 'base.html' %} 2 | 3 | {% block title %} 4 | Choose the Candidates 5 | {% endblock %} 6 | 7 | {% block content %} 8 | 57 |
58 |
59 |
63 | {{ progress|round(2) }}% 64 |
65 |
66 | 67 | {% if similar_inquery_data %} 68 | 71 | {% endif %} 72 | 73 | 74 | 75 |
76 |
77 | 78 | 79 |

#{{ query.id }}

80 |

{{ query.text }}

81 | tags: 82 | {% for tag in query.tags %} 83 | {{ tag.name }}{% if not loop.last %}, {% endif %} 84 | {% endfor %} 85 |
86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | {% for candidate in recommended_candidates %} 96 | 97 | 107 | 114 | 115 | {% endfor %} 116 | 117 | 118 | {% for candidate in other_candidates %} 119 | 120 | 130 | 137 | 138 | {% endfor %} 139 | 140 |
Privacy Design Pattern Relevance
98 |

{{ candidate.candidate.text }}

99 |

{{ candidate.candidate.description }}...more

100 |
101 | tags: 102 | {% for tag in candidate.candidate.tags %} 103 | {{ tag.name }}{% if not loop.last %}, {% endif %} 104 | {% endfor %} 105 | 106 |
108 |
109 | {% for i in range(5, 0, -1) %} 110 | 111 | {% endfor %} 112 |
113 |
121 |

{{ candidate.text }}

122 |

{{ candidate.description }}...more

123 |
124 | tags: 125 | {% for tag in candidate.tags %} 126 | {{ tag.name }}{% if not loop.last %}, {% endif %} 127 | {% endfor %} 128 | 129 |
131 |
132 | {% for i in range(5, 0, -1) %} 133 | 134 | {% endfor %} 135 |
136 |
141 | 142 | 143 |
144 |
145 | 146 | 147 | {% if similar_inquery_data %} 148 | 203 | 204 | 205 | {% endif %} 206 | 207 | 208 | {% endblock %} 209 | -------------------------------------------------------------------------------- /classification/sklearn_classifier.py: -------------------------------------------------------------------------------- 1 | import json, pickle 2 | 3 | from sklearn.feature_extraction.text import CountVectorizer 4 | from sklearn.naive_bayes import MultinomialNB 5 | from sklearn.model_selection import cross_val_score 6 | from sklearn.model_selection import train_test_split 7 | from sklearn.feature_extraction.text import CountVectorizer 8 | from sklearn.model_selection import StratifiedKFold 9 | 10 | import numpy as np 11 | 12 | import json 13 | 14 | from text_preprocessing import preprocess_text 15 | from text_preprocessing import to_lower, remove_stopword, lemmatize_word 16 | 17 | preprocess_functions = [to_lower, remove_stopword, lemmatize_word] 18 | 19 | privacy_objectives = { 20 | "anonymity" : ["Protection-against-tracking", "Location-granularity", "Pseudonymous-messaging", "Onion-routing", "Anonymous-reputation-based-blacklisting", "Attribute-based-credentials", "Anonymity-set"], 21 | 22 | "unlinkability" : ["Protection-against-tracking", "Location-granularity", "Pseudonymous-messaging", "Onion-routing", "Anonymous-reputation-based-blacklisting", "Attribute-based-credentials", "Decoupling-[content]-and-location-information-visibility","Active-broadcast-of-presence","Trustworthy-privacy-plugin"], 23 | 24 | "confidentiality" : ["Informed-Secure-Passwords", "Encryption-user-managed-keys", "Personal-data-store", "Aggregation-gateway", "Single-Point-of-Contact", "User-data-confinement-pattern", "Selective-Access-Control", "Buddy-List", "Added-noise-measurement-obfuscation", "Trustworthy-privacy-plugin", "Support-Selective-Disclosure", "Private-link", "Active-broadcast-of-presence", "Unusual-activities"], 25 | 26 | "plausible_deniability" : ["Location-granularity", "Use-of-dummies", "Onion-routing", "Pseudonymous-identity", "Added-noise-measurement-obfuscation", "Attribute-based-credentials", "Anonymity-set"], 27 | 28 | "undetectability" : ["Location-granularity", "Use-of-dummies", "Aggregation-gateway", "Trustworthy-privacy-plugin", "Active-broadcast-of-presence"], 29 | 30 | "manageability" : ["Federated-privacy-impact-assessment", "Data-breach-notification-pattern", "Trust-Evaluation-of-Services-Sides", "Sign-an-Agreement-to-Solve-Lack-of-Trust-on-the-Use-of-Private-Data-Context", "Obligation-management", "Privacy-Aware-Wording", "Sticky-policy"], 31 | 32 | "intervenability" : ["Minimal-Information-Asymmetry", "Informed-Secure-Passwords", "Awareness-Feed", "Encryption-user-managed-keys", "Whos-Listening", "Discouraging-blanket-strategies", "Outsourcing-[with-consent]", "Personal-data-store", "Single-Point-of-Contact", "Enable-Disable-Functions", "Obtaining-Explicit-Consent", "Decoupling-[content]-and-location-information-visibility", "Selective-Access-Control", "Informed-Credential-Selection", "Reasonable-Level-of-Control", "Masquerade", "Buddy-List", "Lawful-Consent", "Sticky-policy", "Personal-Data-Table", "Informed-Consent-for-Web-based-Transactions", "Support-Selective-Disclosure", "Private-link", "Active-broadcast-of-presence"], 33 | 34 | "transparency" : ["Minimal-Information-Asymmetry", "Informed-Secure-Passwords", "Awareness-Feed", "Whos-Listening", "Privacy-Policy-Display", "Layered-policy-design", "Asynchronous-notice", "Abridged-Terms-and-Conditions", "Policy-matching-display", 
"Ambient-notice", "Dynamic-Privacy-Policy-Display", "Privacy-Labels", "Data-breach-notification-pattern", "Trust-Evaluation-of-Services-Sides", "Appropriate-Privacy-Icons", "Privacy-aware-network-client", "Informed-Implicit-Consent", "Privacy-color-coding", "Icons-for-Privacy-Policies", "Obtaining-Explicit-Consent", "Privacy-Mirrors", "Appropriate-Privacy-Feedback", "Impactful-Information-and-Feedback", "Platform-for-Privacy-Preferences", "Privacy-dashboard", "Preventing-Mistakes-or-Reducing-Their-Impact", "Informed-Credential-Selection", "Privacy-Awareness-Panel", "Lawful-Consent", "Privacy-Aware-Wording", "Sticky-policy", "Personal-Data-Table", "Informed-Consent-for-Web-based-Transactions", "Increasing-Awareness-of-Information-Aggregation", "Unusual-activities"], 35 | } 36 | 37 | hard_goal = ["unlinkability","anonymity","pseudonym","undetectability","confidentiality","plausible_deniability"] 38 | soft_goal = ["transparency","intervenability","content_awareness"] 39 | skip_goal = ["availability", "integrity"] 40 | 41 | hard_pattern = [] 42 | soft_pattern = [] 43 | 44 | for g in hard_goal: 45 | if g not in privacy_objectives: 46 | continue 47 | 48 | for o in privacy_objectives[g]: 49 | hard_pattern.append(o) 50 | 51 | for g in soft_goal: 52 | if g not in privacy_objectives: 53 | continue 54 | 55 | for o in privacy_objectives[g]: 56 | soft_pattern.append(o) 57 | 58 | def get_data(multiclass=False): 59 | req_path = "../data/requirements.json" 60 | 61 | with open(req_path, 'r') as p: 62 | requirements = json.loads(p.read()) 63 | 64 | text, label = [], [] 65 | for r in requirements["rows"]: 66 | text.append(preprocess_text(r["req_text"], preprocess_functions)) 67 | lbl = r["req_type"].replace("_3","").replace("_2","").replace("_1","") 68 | 69 | if multiclass: 70 | label.append(lbl) 71 | else: 72 | label.append(1 if lbl in hard_goal else 0) 73 | 74 | return text, label 75 | 76 | def append_text_label(filepath): 77 | text, label, label_unique = [], [], [] 78 | 79 | # MAKE LABEL AS INDEX 80 | with open(filepath,"r",encoding='utf-8') as dd: 81 | for d in dd: 82 | if len(d)<=8: 83 | continue 84 | 85 | l = d.split()[0].replace("__label__","") 86 | 87 | if l in skip_goal: 88 | continue 89 | 90 | label_unique.append(l) 91 | 92 | label_unique = list(set(label_unique)) 93 | print(label_unique) 94 | 95 | with open(filepath,"r") as dd: 96 | for d in dd: 97 | if len(d)<=8: 98 | continue 99 | 100 | l = d.split()[0].replace("__label__","") 101 | 102 | if l in skip_goal: 103 | continue 104 | 105 | label.append(label_unique.index(l)) 106 | text.append(" ".join(d.split()[1:])) 107 | 108 | # if d[9].strip() in ['0','1']: 109 | # label.append(d[9].strip()) 110 | # text.append(d[11:].strip()) 111 | 112 | return text, label 113 | 114 | 115 | def get_data_from_file(with_aug=True, combine_test=True, binary="binary"): 116 | text, label = [], [] 117 | 118 | if with_aug: 119 | text_temp, label_temp = append_text_label("data/privacy_{}_data_train_aug.txt".format(binary)) 120 | text.extend(text_temp) 121 | label.extend(label_temp) 122 | 123 | else: 124 | text_temp, label_temp = append_text_label("data/privacy_{}_data_train.txt".format(binary)) 125 | text.extend(text_temp) 126 | label.extend(label_temp) 127 | 128 | if combine_test: 129 | text_temp, label_temp = append_text_label("data/privacy_{}_data_test.txt".format(binary)) 130 | text.extend(text_temp) 131 | label.extend(label_temp) 132 | 133 | return text, label 134 | 135 | def test_cross_val(): 136 | text, label = get_data_from_file(with_aug=False, combine_test=True, 
binary="multi") 137 | 138 | count_vect = CountVectorizer(analyzer="word", ngram_range=(1,1)) 139 | train_data = count_vect.fit_transform(text) 140 | 141 | 142 | clf1 = MultinomialNB() 143 | 144 | print ("Naive Bayes:", np.mean(cross_val_score(clf1, train_data, label, scoring='f1_macro',cv=5))) 145 | 146 | def make_classifier(): 147 | text, label = get_data_from_file(with_aug=True, combine_test=False, binary="multi") 148 | 149 | count_vect = CountVectorizer(analyzer="word", ngram_range=(1,1)) 150 | train_data = count_vect.fit_transform(text) 151 | 152 | filename = 'multi_vectorizer_model.sav' 153 | pickle.dump(count_vect, open(filename, 'wb')) 154 | 155 | clf1 = MultinomialNB() 156 | clf1.fit(train_data, label) 157 | 158 | filename = 'multi_nb_model.sav' 159 | pickle.dump(clf1, open(filename, 'wb')) 160 | 161 | def predict_class(texts, model, vectorizer): 162 | loaded_model = pickle.load(open(model, 'rb')) 163 | loaded_vect = pickle.load(open(vectorizer, 'rb')) 164 | 165 | text = [preprocess_text(t, preprocess_functions) for t in texts] 166 | v_text = loaded_vect.transform(text) 167 | 168 | prediction = loaded_model.predict(v_text) 169 | 170 | return prediction 171 | 172 | def test_classifier(): 173 | test_data = "../data/sec_compass.json" 174 | 175 | with open(test_data, 'r', encoding="utf-8-sig") as p: 176 | requirements = json.loads(p.read()) 177 | 178 | req_type = [r["req_type"] for r in requirements["rows"]] 179 | texts = [r["req_text"] for r in requirements["rows"]] 180 | prediction = predict_class(texts,'nb_model.sav','vectorizer_model.sav') 181 | 182 | for i,p in enumerate(prediction): 183 | print(req_type[i], p) 184 | 185 | def test_classifier_on_pattern(): 186 | pattern_file = "../data/patterns.json" 187 | 188 | with open(pattern_file, 'r') as p: 189 | patterns = json.loads(p.read()) 190 | 191 | texts = [] 192 | 193 | for pattern in patterns: 194 | pattern_text = [] 195 | 196 | pattern_text.append(pattern["excerpt"]) 197 | 198 | for heading in pattern["heading"]: 199 | pattern_text.append(heading["content"].strip()) 200 | 201 | texts.append(". 
".join(pattern_text)) 202 | 203 | # BINARY CLASS 204 | prediction = predict_class(texts,"binary_nb_model.sav","binary_vectorizer_model.sav") 205 | 206 | for i,p in enumerate(prediction): 207 | print(patterns[i]["title"], p) 208 | 209 | print("=="*20) 210 | 211 | # MULTI CLASS 212 | prediction = predict_class(texts,"multi_nb_model.sav","multi_vectorizer_model.sav") 213 | 214 | for i,p in enumerate(prediction): 215 | print(patterns[i]["title"], p) 216 | 217 | 218 | 219 | def test_classifier_flair(): 220 | from flair.data import Sentence 221 | from flair.models import TextClassifier 222 | 223 | # only for binary 224 | # after testing this model, I dont know why it shows all 1 225 | model = TextClassifier.load('binary/glove-roberta-best-model.pt') # create example sentence 226 | 227 | test_data = "../data/sec_compass.json" 228 | 229 | with open(test_data, 'r', encoding="utf-8-sig") as p: 230 | reqs = json.loads(p.read()) 231 | 232 | for r in reqs["rows"]: 233 | sentence = Sentence(r["req_text"]) 234 | model.predict(sentence) 235 | print("True Type:",r["req_type"],r["req_type"]) 236 | # print(sentence.labels) 237 | print(sentence.labels[0].value, sentence.labels[0].score) 238 | print() 239 | 240 | def reduce_test_data(): 241 | test_data = "../data/propan_patterns_requirements.json" 242 | 243 | with open(test_data, 'r', encoding="utf-8-sig") as p: 244 | requirements = json.loads(p.read()) 245 | 246 | texts = [r["req_text"] for r in requirements] 247 | 248 | prediction = predict_class(texts) 249 | 250 | new_test_data = [] 251 | for i, label in enumerate(prediction): 252 | reduced_pattern = [] 253 | goal = soft_pattern 254 | if label == "1": 255 | goal = hard_pattern 256 | 257 | print(goal) 258 | 259 | for p in requirements[i]["pattern"]: 260 | if p["name"] in goal: 261 | reduced_pattern.append(p) 262 | 263 | new_test_data.append({"id":requirements[i]["id"],"req_text":requirements[i]["req_text"],"pattern":reduced_pattern}) 264 | 265 | with open("../data/reduced_propan_patterns_requirements.json", "w") as outfile: 266 | json.dump(new_test_data, outfile, indent=3) 267 | 268 | 269 | def classify_new_data(): 270 | json_file_path = 'all_inqueries.json' 271 | 272 | with open(json_file_path, 'r') as json_file: 273 | data = json.load(json_file) 274 | 275 | texts = [d["req_text"] for d in data] 276 | 277 | prediction = predict_class(texts, "model/binary_nb_model.sav", "model/binary_vectorizer_model.sav") 278 | 279 | # Return data and prediction labels 280 | return data, prediction 281 | 282 | def split_dataset_into_5fold(): 283 | data, labels = classify_new_data() 284 | 285 | # Create a dictionary to hold data grouped by labels 286 | grouped_data = { 287 | 0: [], 288 | 1: [] 289 | } 290 | 291 | for d, label in zip(data, labels): 292 | grouped_data[int(label)].append(d) 293 | 294 | skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42) 295 | 296 | folds = [] 297 | 298 | for train_idx, test_idx in skf.split(data, labels): 299 | train_data = [data[i] for i in train_idx] 300 | test_data = [data[i] for i in test_idx] 301 | 302 | train_labels = [labels[i] for i in train_idx] 303 | test_labels = [labels[i] for i in test_idx] 304 | 305 | folds.append((train_data, test_data, train_labels, test_labels)) 306 | 307 | return folds 308 | 309 | def split_dataset_uniformly(): 310 | data, labels = classify_new_data() 311 | 312 | # Create a dictionary to hold data grouped by labels 313 | grouped_data = { 314 | 0: [], 315 | 1: [] 316 | } 317 | 318 | for d, label in zip(data, labels): 319 | 
grouped_data[int(label)].append(d) 320 | 321 | # Split each group into train, dev, and test sets 322 | train, test = [], [] 323 | for label, items in grouped_data.items(): 324 | train_set, test_set = train_test_split(items, test_size=0.2, random_state=42) # Splitting 80% train, 20% test 325 | 326 | train.extend(train_set) 327 | test.extend(test_set) 328 | 329 | return train, test 330 | 331 | def export_to_json(data, filename): 332 | """Export data to a JSON file.""" 333 | with open(filename, 'w') as json_file: 334 | json.dump(data, json_file, indent=4) 335 | 336 | folds = split_dataset_into_5fold() 337 | 338 | for fold_num, (train_data, test_data, train_labels, test_labels) in enumerate(folds, 1): 339 | # Exporting train data for each fold to JSON 340 | train_filename = f'train_patterns_req_v2_fold_{fold_num}.json' 341 | export_to_json(train_data, train_filename) 342 | 343 | # Exporting test data for each fold to JSON 344 | test_filename = f'test_patterns_req_v2_fold_{fold_num}.json' 345 | export_to_json(test_data, test_filename) 346 | 347 | 348 | # test_cross_val() 349 | # make_classifier() 350 | # test_classifier() 351 | 352 | # test_classifier_on_pattern() 353 | 354 | # reduce_test_data() 355 | -------------------------------------------------------------------------------- /annotation-program/app.py: -------------------------------------------------------------------------------- 1 | from flask_sqlalchemy import SQLAlchemy 2 | from flask import Flask, render_template, request, redirect, url_for, flash, jsonify, Response 3 | from flask_bootstrap import Bootstrap 4 | from flask_login import LoginManager, UserMixin, login_user, login_required, logout_user, UserMixin, current_user 5 | from sqlalchemy.orm import relationship 6 | from sqlalchemy.sql import func 7 | from sqlalchemy import desc 8 | from flask_migrate import Migrate 9 | from tqdm import tqdm 10 | import numpy as np 11 | from werkzeug.wrappers import Response as ResponseBase 12 | 13 | # from sentence_transformers import SentenceTransformer 14 | 15 | # Initialize the sentence-transformers model 16 | # model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2') 17 | 18 | app = Flask(__name__) 19 | app.config["SQLALCHEMY_DATABASE_URI"] = 'sqlite:///ltr_annotation.db' 20 | app.config['SECRET_KEY'] = 'klaf9897fwehkwe' # Replace this with a real secret key 21 | 22 | Bootstrap(app) 23 | db = SQLAlchemy(app) 24 | 25 | 26 | migrate = Migrate(app, db) 27 | 28 | login_manager = LoginManager() 29 | login_manager.init_app(app) 30 | login_manager.login_view = 'login' 31 | 32 | class User(UserMixin, db.Model): 33 | id = db.Column(db.Integer, primary_key=True) 34 | username = db.Column(db.String(80), unique=True, nullable=False) 35 | password = db.Column(db.String(120), nullable=False) 36 | annotations = relationship("Annotation", back_populates="user") 37 | 38 | class Tag(db.Model): 39 | id = db.Column(db.Integer, primary_key=True) 40 | name = db.Column(db.String(100), nullable=False) 41 | 42 | query_tags = db.Table('query_tags', 43 | db.Column('tag_id', db.Integer, db.ForeignKey('tag.id'), primary_key=True), 44 | db.Column('query_id', db.Integer, db.ForeignKey('inquery.id'), primary_key=True) 45 | ) 46 | 47 | candidate_tags = db.Table('candidate_tags', 48 | db.Column('tag_id', db.Integer, db.ForeignKey('tag.id'), primary_key=True), 49 | db.Column('candidate_id', db.Integer, db.ForeignKey('candidate.id'), primary_key=True) 50 | ) 51 | 52 | class Inquery(db.Model): 53 | id = db.Column(db.Integer, primary_key=True) 54 | text = 
db.Column(db.String(200), nullable=False) 55 | source = db.Column(db.String(200), nullable=True) 56 | tags = db.relationship('Tag', secondary=query_tags, lazy='subquery', 57 | backref=db.backref('inqueries', lazy=True)) 58 | 59 | 60 | class Candidate(db.Model): 61 | id = db.Column(db.Integer, primary_key=True) 62 | text = db.Column(db.String(200), nullable=False) 63 | description = db.Column(db.String(1000), nullable=False) 64 | source = db.Column(db.String(200), nullable=True) 65 | tags = db.relationship('Tag', secondary=candidate_tags, lazy='subquery', 66 | backref=db.backref('candidates', lazy=True)) 67 | 68 | class CandidatesRecommended(db.Model): 69 | id = db.Column(db.Integer, primary_key=True) 70 | query_id = db.Column(db.Integer, db.ForeignKey('inquery.id'), nullable=False) 71 | candidate_id = db.Column(db.Integer, db.ForeignKey('candidate.id'), nullable=False) 72 | relevance = db.Column(db.Float, nullable=False) 73 | query_data = relationship("Inquery", backref="recommended_candidates") 74 | candidate = relationship("Candidate") 75 | 76 | class Annotation(db.Model): 77 | id = db.Column(db.Integer, primary_key=True) 78 | user_id = db.Column(db.Integer, db.ForeignKey('user.id'), nullable=False) 79 | query_id = db.Column(db.Integer, db.ForeignKey('inquery.id'), nullable=False) 80 | candidate_id = db.Column(db.Integer, db.ForeignKey('candidate.id'), nullable=False) 81 | rank = db.Column(db.Integer, nullable=False) 82 | relevance = db.Column(db.Float, nullable=False) 83 | timestamp = db.Column(db.DateTime(timezone=True), server_default=func.now()) 84 | user = relationship("User", back_populates="annotations") 85 | query_data = relationship("Inquery") 86 | candidate = relationship("Candidate") 87 | 88 | class Similarity(db.Model): 89 | id = db.Column(db.Integer, primary_key=True) 90 | inquery_id = db.Column(db.Integer, db.ForeignKey('inquery.id'), nullable=False) 91 | other_inquery_id = db.Column(db.Integer, nullable=False) 92 | score = db.Column(db.Float, nullable=False) 93 | 94 | @login_manager.user_loader 95 | def load_user(user_id): 96 | return User.query.get(int(user_id)) 97 | 98 | @app.route('/') 99 | @login_required 100 | def home(): 101 | total_queries_by_source = db.session.query(Inquery.source, func.count(Inquery.id)).group_by(Inquery.source).all() 102 | 103 | # Create a subquery that groups by query_id and only select query_id 104 | subquery = db.session.query(Annotation.query_id).filter(Annotation.user_id==current_user.id).group_by(Annotation.query_id).subquery() 105 | 106 | # Now group the subquery result by source 107 | total_queries_done_by_user = db.session.query(Inquery.source, func.count(subquery.c.query_id)).join(subquery, Inquery.id == subquery.c.query_id).group_by(Inquery.source).all() 108 | 109 | total_queries = {source: count for source, count in total_queries_by_source} 110 | queries_done = {source: count for source, count in total_queries_done_by_user} 111 | 112 | # This creates a list of dictionaries, where each dictionary contains the source, total queries, and queries done by the user 113 | queries_data = [{'source': source, 'total': total_queries[source], 'done': queries_done.get(source, 0)} for source in total_queries] 114 | 115 | return render_template('home.html', annotations_data=queries_data) 116 | 117 | @app.route('/tag/') 118 | def tag(tag_name): 119 | # Inquery the database to get all candidates with this tag 120 | tag = Tag.query.filter_by(name=tag_name).first_or_404() 121 | candidates = tag.candidates # assuming a backref in your Tag model 122 | 
return render_template('tag.html', tag=tag, candidates=candidates) 123 | 124 | @app.route('/login', methods=['GET', 'POST']) 125 | def login(): 126 | if request.method == 'POST': 127 | user = User.query.filter_by(username=request.form['username']).first() 128 | if user and user.password == request.form['password']: # You should use hashed passwords in a real application 129 | login_user(user) 130 | return redirect(url_for('home')) 131 | return render_template('login.html') 132 | 133 | @app.route('/register', methods=['GET', 'POST']) 134 | def register(): 135 | if request.method == 'POST': 136 | new_user = User(username=request.form['username'], password=request.form['password']) 137 | db.session.add(new_user) 138 | db.session.commit() 139 | return redirect(url_for('login')) 140 | return render_template('register.html') 141 | 142 | @app.route('/logout') 143 | @login_required 144 | def logout(): 145 | logout_user() 146 | return redirect(url_for('login')) 147 | 148 | def calculate_tag_overlap(query_tags, other_query_tags): 149 | return len(set(query_tags) & set(other_query_tags)) 150 | 151 | @app.route('/apply_annotation', methods=['POST']) 152 | @login_required 153 | def apply_annotation(): 154 | # Fetch the current user's id 155 | user_id = current_user.id 156 | 157 | # Extract the query_id from the pre-annotated data and current_query_id from the form data 158 | previous_query_id = request.form.get("previous_query_id") 159 | current_query_id = request.form.get("current_query_id") 160 | 161 | sql = """ 162 | INSERT INTO annotation (user_id, query_id, candidate_id, rank, relevance, timestamp) 163 | SELECT :user_id, :current_query_id, candidate_id, rank, relevance, datetime('now') 164 | FROM annotation 165 | WHERE query_id = :previous_query_id; 166 | """ 167 | 168 | params = { 169 | "user_id": user_id, 170 | "current_query_id": current_query_id, 171 | "previous_query_id": previous_query_id 172 | } 173 | 174 | db.session.execute(sql, params) 175 | db.session.commit() 176 | 177 | 178 | return redirect(url_for('annotate', source=request.form.get("source"))) 179 | 180 | 181 | @app.route('/annotate/', methods=['GET', 'POST']) 182 | @login_required 183 | def annotate(source): 184 | if request.method == 'POST': 185 | _save_annotations(request.form, current_user.id) 186 | 187 | return redirect(url_for('annotate', source=source)) 188 | 189 | else: 190 | query = _get_unannotated_query_for_user(current_user.id, source) 191 | if not query: 192 | flash("All queries have been annotated. 
Thank you for your contribution!", "info") 193 | return redirect(url_for('home')) 194 | 195 | progress = _calculate_annotation_progress(source, current_user.id) 196 | similar_inquery_data = get_recommended_queries_by_semantic_similarity(query) 197 | recommended_candidates, other_candidates = _get_candidates(query, current_user.id) 198 | 199 | return render_template('annotate.html', query=query, similar_inquery_data=similar_inquery_data, 200 | recommended_candidates=recommended_candidates, other_candidates=other_candidates, 201 | progress=progress) 202 | 203 | def _save_annotations(form_data, user_id): 204 | for candidate_id, relevance in form_data.items(): 205 | if candidate_id == 'query_id': # ignore the query text field 206 | continue 207 | annotation = Annotation(user_id=user_id, query_id=form_data["query_id"], candidate_id=candidate_id, 208 | relevance=float(relevance), rank=int(relevance)) 209 | db.session.add(annotation) 210 | db.session.commit() 211 | 212 | def _get_unannotated_query_for_user(user_id, source): 213 | annotated_queries_ids = db.session.query(Annotation.query_id).join(Inquery, Annotation.query_id == Inquery.id)\ 214 | .filter(Annotation.user_id == user_id, Inquery.source == source).distinct() 215 | return Inquery.query.filter(Inquery.source == source, Inquery.id.notin_(annotated_queries_ids)).first() 216 | 217 | def _get_candidates(query, user_id): 218 | recommended_candidates = CandidatesRecommended.query.filter_by(query_id=query.id).all() 219 | recommended_candidates_ids = [candidate.candidate_id for candidate in recommended_candidates] 220 | 221 | annotated_candidates_ids = [annotation.candidate_id for annotation in 222 | Annotation.query.filter_by(query_id=query.id, user_id=user_id).all()] 223 | 224 | other_candidates = Candidate.query.filter(Candidate.id.notin_(recommended_candidates_ids), 225 | Candidate.id.notin_(annotated_candidates_ids)).all() 226 | return recommended_candidates, other_candidates 227 | 228 | def _calculate_annotation_progress(source, user_id): 229 | total_queries = Inquery.query.filter_by(source=source).count() 230 | annotated_queries_ids = db.session.query(Annotation.query_id).join(Inquery, Annotation.query_id == Inquery.id)\ 231 | .filter(Annotation.user_id == user_id, Inquery.source == source).distinct() 232 | annotated_queries_count = annotated_queries_ids.count() 233 | return (annotated_queries_count / total_queries) * 100 234 | 235 | def get_recommended_queries_by_tags(query): 236 | query_tags = [tag.name for tag in query.tags] 237 | 238 | def get_recommended_queries_by_semantic_similarity(query): 239 | query_tags = [tag.name for tag in query.tags] 240 | 241 | similar_inqueries_with_overlap = [] 242 | 243 | # After fetching the query to annotate 244 | threshold = 0.4 245 | similarities = ( 246 | Similarity.query.filter(Similarity.inquery_id == query.id, Similarity.score >= threshold) 247 | .order_by(desc(Similarity.score)) 248 | .all() 249 | ) 250 | for similarity in similarities: 251 | other_inquery = Inquery.query.get(similarity.other_inquery_id) 252 | other_query_tags = [tag.name for tag in other_inquery.tags] 253 | overlap_count = calculate_tag_overlap(query_tags, other_query_tags) 254 | similar_inqueries_with_overlap.append((similarity, overlap_count)) 255 | 256 | # Order the inqueries based on the number of overlapping tags (in descending order) 257 | similar_inqueries_with_overlap.sort(key=lambda x: x[1], reverse=True) 258 | 259 | similar_inquery_data_dict = {} 260 | for similarity, _ in similar_inqueries_with_overlap: 261 | 
annotations = Annotation.query.filter_by(query_id=similarity.other_inquery_id).all() 262 | if annotations: 263 | for annotation in annotations: 264 | candidate = Candidate.query.get(annotation.candidate_id) 265 | if candidate: 266 | other_inquery = Inquery.query.get(similarity.other_inquery_id) 267 | tags = [tag.name for tag in other_inquery.tags] 268 | 269 | if similarity.other_inquery_id not in similar_inquery_data_dict: 270 | similar_inquery_data_dict[similarity.other_inquery_id] = { 271 | "id": other_inquery.id, 272 | "query": other_inquery.text, 273 | "candidates": [(candidate.text, annotation.relevance)], 274 | "timestamp": annotation.timestamp.strftime("%Y-%m-%d %H:%M:%S"), 275 | "tags": tags, 276 | "score": "{:.2f}".format(similarity.score) 277 | } 278 | else: 279 | similar_inquery_data_dict[similarity.other_inquery_id]['candidates'].append( 280 | (candidate.text, annotation.relevance) 281 | ) 282 | 283 | # Convert dictionary to a list of dictionaries for rendering in the template 284 | similar_inquery_data = list(similar_inquery_data_dict.values()) 285 | 286 | return similar_inquery_data 287 | 288 | @app.route('/user_stats') 289 | @login_required 290 | def user_stats(): 291 | user_annotations = Annotation.query.filter_by(user_id=current_user.id).all() 292 | annotations_data = {} 293 | 294 | for annotation in user_annotations: 295 | try: 296 | if annotation.query_data.id not in annotations_data: 297 | annotations_data[annotation.query_data.id] = { 298 | 'query': annotation.query_data.text, 299 | 'candidates': [(annotation.candidate.text, annotation.relevance)], 300 | 'timestamp': annotation.timestamp.strftime("%Y-%m-%d %H:%M:%S") 301 | } 302 | else: 303 | annotations_data[annotation.query_data.id]['candidates'].append( 304 | (annotation.candidate.text, annotation.relevance)) 305 | except AttributeError as e: 306 | app.logger.error(f'AttributeError for annotation id {annotation.id}: {str(e)}') 307 | 308 | # Convert dictionary to a list of dictionaries for easier handling in the template 309 | annotations_data = list(annotations_data.values()) 310 | 311 | return render_template('user_stats.html', annotations_data=annotations_data) 312 | 313 | @app.route('/precompute_similarity') 314 | def precompute_similarity(): 315 | # Retrieve all inqueries 316 | inqueries = Inquery.query.all() 317 | 318 | # Extract texts from inqueries 319 | texts = [inquery.text for inquery in inqueries] 320 | 321 | # Get embeddings for the texts 322 | embeddings = model.encode(texts) 323 | 324 | # Compute pairwise semantic similarity scores 325 | similarity_matrix = np.inner(embeddings, embeddings) 326 | 327 | # Store the precomputed similarities in the Similarity table 328 | for i in tqdm(range(len(inqueries)), desc="Storing Similarities"): 329 | inquery = inqueries[i] 330 | for j, other_inquery in enumerate(inqueries): 331 | if i != j: # Skip self-similarity 332 | similarity = Similarity(inquery_id=inquery.id, other_inquery_id=other_inquery.id, score=similarity_matrix[i, j]) 333 | db.session.add(similarity) 334 | db.session.commit() 335 | 336 | def get_data_for_all_inqueries(): 337 | # Fetch all inqueries 338 | inqueries = Inquery.query.all() 339 | 340 | all_data = [] 341 | 342 | for inquery in inqueries: 343 | # Fetch all candidates 344 | candidates = Candidate.query.all() 345 | 346 | pattern_list = [] 347 | 348 | for candidate in candidates: 349 | # Check if there's an annotation for the current inquery-candidate pair 350 | annotation = Annotation.query.filter_by(query_id=inquery.id, 
candidate_id=candidate.id).first() 351 | 352 | if annotation: 353 | relevance = annotation.relevance 354 | else: 355 | relevance = 0 356 | 357 | pattern_data = { 358 | "name": candidate.text, 359 | "rating": int(relevance) 360 | } 361 | pattern_list.append(pattern_data) 362 | 363 | # Format data for this inquery 364 | data = { 365 | "id": inquery.id, 366 | "req_type": [tag.name for tag in inquery.tags], # Convert to list 367 | "req_name": f"{inquery.source} {inquery.id}", 368 | "req_text": inquery.text, 369 | "pattern": pattern_list 370 | } 371 | 372 | all_data.append(data) 373 | 374 | return all_data 375 | 376 | 377 | @app.route('/download_all') 378 | def download_json(): 379 | data = get_data_for_all_inqueries() 380 | if not data: 381 | return "No inqueries found", 404 382 | 383 | # Convert the list of dictionaries to a JSON string 384 | json_data = jsonify(data).get_data(as_text=True) 385 | 386 | # Create a Flask Response with headers for download 387 | response = ResponseBase(json_data, content_type="application/json") 388 | response.headers["Content-Disposition"] = "attachment; filename=all_inqueries.json" 389 | return response 390 | 391 | if __name__ == '__main__': 392 | app.run(debug=True) 393 | 394 | -------------------------------------------------------------------------------- /recommender/feature_creation.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | 3 | nltk.download('stopwords') 4 | nltk.download('punkt') 5 | 6 | import json, pickle 7 | import numpy as np 8 | from sklearn.feature_extraction.text import TfidfVectorizer 9 | from scipy import sparse 10 | import nltk 11 | from nltk.corpus import stopwords 12 | from nltk.tokenize import word_tokenize 13 | from sentence_transformers import SentenceTransformer, util 14 | from text_preprocessing import preprocess_text 15 | from text_preprocessing import to_lower, remove_stopword, lemmatize_word 16 | import torch 17 | import os 18 | 19 | preprocess_functions = [to_lower, remove_stopword, lemmatize_word] 20 | 21 | PARENT_FOLDER = "data/" 22 | 23 | ''' 24 | Construct features for learning-to-rank 25 | The main function is the construct_features(q) which receives input of a query (which in our case a requirement) 26 | then it is computed for each pattern in privacypatterns.org 27 | ''' 28 | 29 | class PrivacyPatternFeatures(object): 30 | def __init__(self): 31 | self.patterns, self.pattern_titles, self.pattern_excerpts = self.get_corpus_pattern() 32 | self.initiate_tf_idf() 33 | self.initiate_bm25(0.75, 1.6) 34 | 35 | print("Loading LTR Embeddings...") 36 | # self.model_sentence_transformer = SentenceTransformer('all-MiniLM-L6-v2') 37 | # self.model_sentence_transformer_overflow = SentenceTransformer('flax-sentence-embeddings/stackoverflow_mpnet-base') 38 | 39 | self.model_sentence_transformer = SentenceTransformer('all-mpnet-base-v2') 40 | self.model_sentence_transformer_overflow = SentenceTransformer('dean-ai/legal_heBERT_ft') 41 | 42 | # self.precompute_pattern_embeddings() 43 | self.load_pattern_embeddings() 44 | 45 | def construct_features(self, q): 46 | q_words = word_tokenize(self.remove_stopwords(q)) 47 | 48 | # we adapt the representation from MSLR-WEB dataset 49 | # q is query that represents the requirements 50 | # pattern is the document 51 | # each query have the pattern features 52 | # query level feature = when the parameter only contain q 53 | 54 | len_q = len(self.remove_stopwords(q)) 55 | idf_q = self.get_idf(q_words) 56 | tf_idf_q = self.tf_idf_features(q) 57 | 
bm25 = self.bm25(q) 58 | binary_q, multi_q = self.class_features([q]) 59 | binary_pattern, multi_pattern = self.class_features(self.patterns) 60 | 61 | cosine_pattern, cosine_title, cosine_excerpt, cosine_pattern_overflow, cosine_title_overflow, cosine_excerpt_overflow = self.semantic_similarity_features(q) 62 | deep_semantic_features = self.deep_semantic_interaction_features(q) 63 | 64 | features_all = [] 65 | for i, pattern in enumerate(self.patterns): 66 | features = [] 67 | features.extend(self.number_of_covered_words(q_words, pattern)) # 1, 2 68 | features.append(len_q) # 3 69 | features.append(idf_q) # 4 70 | features.extend(self.tf_features(q_words, pattern)) # 5 - 14 71 | features.extend(tf_idf_q) # 15 - 19 72 | features.append(bm25[i]) # 20 73 | features.append(float(cosine_pattern[0][i])) # 21 74 | features.append(float(cosine_title[0][i])) # 22 75 | features.append(float(cosine_excerpt[0][i])) # 23 76 | features.append(float(cosine_pattern_overflow[0][i])) # 24 77 | features.append(float(cosine_title_overflow[0][i])) # 25 78 | features.append(float(cosine_excerpt_overflow[0][i])) # 26 79 | 80 | features.append(binary_q[0]) 81 | features.append(multi_q[0]) 82 | features.append(binary_pattern[i]) 83 | features.append(multi_pattern[i]) 84 | 85 | # Append deep semantic interaction features: similarities 86 | features.append(float(deep_semantic_features["similarities"][0][0][i])) # pattern similarity 87 | features.append(float(deep_semantic_features["similarities"][1][0][i])) # title similarity 88 | features.append(float(deep_semantic_features["similarities"][2][0][i])) # excerpt similarity 89 | features.append(float(deep_semantic_features["similarities"][3][0][i])) # pattern similarity (overflow model) 90 | features.append(float(deep_semantic_features["similarities"][4][0][i])) # title similarity (overflow model) 91 | features.append(float(deep_semantic_features["similarities"][5][0][i])) # excerpt similarity (overflow model) 92 | 93 | # Append deep semantic interaction features: hadamard products 94 | features.extend(deep_semantic_features["hadamard_products"][0][i].tolist()) # pattern 95 | features.extend(deep_semantic_features["hadamard_products"][1][i].tolist()) # title 96 | features.extend(deep_semantic_features["hadamard_products"][2][i].tolist()) # excerpt 97 | features.extend(deep_semantic_features["hadamard_products"][3][i].tolist()) # pattern (overflow model) 98 | features.extend(deep_semantic_features["hadamard_products"][4][i].tolist()) # title (overflow model) 99 | features.extend(deep_semantic_features["hadamard_products"][5][i].tolist()) # excerpt (overflow model) 100 | 101 | # # Append deep semantic interaction features: concatenation 102 | features.extend(deep_semantic_features["concatenations"][0][i].tolist()) # pattern 103 | features.extend(deep_semantic_features["concatenations"][1][i].tolist()) # title 104 | features.extend(deep_semantic_features["concatenations"][2][i].tolist()) # excerpt 105 | features.extend(deep_semantic_features["concatenations"][3][i].tolist()) # pattern (overflow model) 106 | features.extend(deep_semantic_features["concatenations"][4][i].tolist()) # title (overflow model) 107 | features.extend(deep_semantic_features["concatenations"][5][i].tolist()) # excerpt (overflow model) 108 | 109 | features_all.append(features) 110 | 111 | return features_all 112 | 113 | def get_corpus_pattern(self): 114 | pattern_file= PARENT_FOLDER + "patterns.json" 115 | X = [] 116 | title = [] 117 | excerpt = [] 118 | with open(pattern_file, 'r') as p: 119 | 
patterns = json.loads(p.read()) 120 | 121 | for pattern in patterns: 122 | text = "" 123 | 124 | filename = pattern["filename"].replace(".md","").replace("-"," ") 125 | 126 | title.append(filename) 127 | excerpt.append(pattern["excerpt"].strip()) 128 | 129 | text += filename 130 | if not text.endswith("."): 131 | text += ". " 132 | 133 | text += pattern["excerpt"].strip() 134 | if not text.endswith("."): 135 | text += ". " 136 | 137 | for heading in pattern["heading"]: 138 | text += heading["content"].strip() 139 | if not text.endswith("."): 140 | text += ". " 141 | 142 | X.append(text) 143 | 144 | X_new, title_new, excerpt_new = self.get_new_patterns() 145 | 146 | X.extend(X_new) 147 | title.extend(title_new) 148 | excerpt.extend(excerpt_new) 149 | 150 | return X, title, excerpt 151 | 152 | 153 | def get_new_patterns(self): 154 | pattern_file= PARENT_FOLDER + "patterns_new.json" 155 | X = [] 156 | title = [] 157 | excerpt = [] 158 | with open(pattern_file, 'r') as p: 159 | patterns = json.loads(p.read()) 160 | 161 | for pattern in patterns: 162 | X.append(pattern["description"]) 163 | title.append(pattern["title"]) 164 | excerpt.append(pattern["excerpt"]) 165 | 166 | return X, title, excerpt 167 | 168 | def remove_stopwords(self, q): 169 | stop_words = set(stopwords.words('english')) 170 | word_tokens = word_tokenize(q) 171 | filtered_sentence = " ".join([w for w in word_tokens if not w.lower() in stop_words]) 172 | 173 | return filtered_sentence 174 | 175 | 176 | def initiate_tf_idf(self): 177 | self.tf_idf_vectorizer = TfidfVectorizer(norm=None, smooth_idf=False) 178 | self.tf_idf_vectorizer.fit(self.patterns) 179 | self.tf_idf_feature_names = self.tf_idf_vectorizer.get_feature_names_out() 180 | 181 | 182 | def initiate_bm25(self, b, k1): 183 | self.b = b 184 | self.k1 = k1 185 | 186 | y = super(TfidfVectorizer, self.tf_idf_vectorizer).transform(self.patterns) 187 | self.avdl = y.sum(1).mean() 188 | 189 | 190 | def bm25(self, q): 191 | X = self.patterns 192 | """ Calculate BM25 between query q and documents X """ 193 | b, k1, avdl = self.b, self.k1, self.avdl 194 | 195 | # apply CountVectorizer 196 | X = super(TfidfVectorizer, self.tf_idf_vectorizer).transform(X) 197 | len_X = X.sum(1).A1 198 | q, = super(TfidfVectorizer, self.tf_idf_vectorizer).transform([q]) 199 | assert sparse.isspmatrix_csr(q) 200 | 201 | # convert to csc for better column slicing 202 | X = X.tocsc()[:, q.indices] 203 | denom = X + (k1 * (1 - b + b * len_X / avdl))[:, None] 204 | # idf(t) = log [ n / df(t) ] + 1 in sklearn, so it need to be coneverted 205 | # to idf(t) = log [ n / df(t) ] with minus 1 206 | idf = self.tf_idf_vectorizer._tfidf.idf_[None, q.indices] - 1. 207 | numer = X.multiply(np.broadcast_to(idf, X.shape)) * (k1 + 1) 208 | return (numer / denom).sum(1).A1 209 | 210 | def number_of_covered_words(self, q_words, pattern): 211 | # How many terms in the user query are covered by the text. 212 | # ration = Covered query term number divided by the number of query terms. 213 | 214 | n = 0 215 | for word in q_words: 216 | if word.lower() in pattern.lower(): 217 | n += 1 218 | 219 | ratio = n/len(q_words) 220 | return [n, ratio] 221 | 222 | def get_idf(self, q_words): 223 | # 1 divided by the number of documents containing the query terms. 
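        # Worked illustration (comment added for clarity, not a behavioural change):
        # for q_words = ["encrypt", "consent"], word_in_patterns collects each query
        # term that appears in at least one pattern text. If both terms occur, the
        # returned feature is 1/2 = 0.5; if neither occurs, 0 is returned. Note this
        # is a single aggregate over the whole query rather than a per-term IDF.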
224 | 225 | n = 0 226 | word_in_patterns = set() 227 | for pattern in self.patterns: 228 | for word in q_words: 229 | if word.lower() in pattern.lower(): 230 | word_in_patterns.add(word.lower()) 231 | 232 | if len(list(word_in_patterns)) == 0: 233 | return 0 234 | 235 | idf = 1/len(list(word_in_patterns)) 236 | 237 | return idf 238 | 239 | def tf_features(self, q_words, pattern): 240 | # Sum, Min, Max, Average, Variance of counts of each query term in the document. 241 | # Normalized version : term counts divided by text length 242 | 243 | pattern_words = word_tokenize(pattern) 244 | total_len = len(pattern_words) 245 | n_count_all = [pattern_words.count(word) for word in q_words] 246 | 247 | tf_sum, tf_min, tf_max, tf_avg, tf_var = sum(n_count_all), min(n_count_all), max(n_count_all), np.average(n_count_all), np.var(n_count_all) 248 | 249 | norm_tf_sum, norm_tf_min, norm_tf_max, norm_tf_avg, norm_tf_var = sum(n_count_all)/total_len, min(n_count_all)/total_len, max(n_count_all)/total_len, np.average(n_count_all)/total_len, np.var(n_count_all)/float(total_len) 250 | 251 | return [tf_sum, tf_min, tf_max, tf_avg, tf_var, norm_tf_sum, norm_tf_min, norm_tf_max, norm_tf_avg, norm_tf_var] 252 | 253 | 254 | def tf_idf_features(self, q): 255 | tfidf_matrix= self.tf_idf_vectorizer.transform([q]).todense() 256 | feature_index = tfidf_matrix[0,:].nonzero()[1] 257 | tfidf_scores = zip([self.tf_idf_feature_names[i] for i in feature_index], [tfidf_matrix[0, x] for x in feature_index]) 258 | 259 | word_scores = [score for score in dict(tfidf_scores).values()] 260 | 261 | tfidf_sum, tfidf_min, tfidf_max, tfidf_avg, tfidf_var = sum(word_scores), min(word_scores), max(word_scores), np.average(word_scores), np.var(word_scores) 262 | 263 | return [tfidf_sum, tfidf_min, tfidf_max, tfidf_avg, tfidf_var] 264 | 265 | def precompute_pattern_embeddings(self): 266 | PARENT_FOLDER = "" 267 | print("Precompute Pattern Embeddings") 268 | # Compute embeddings for patterns 269 | self.emb_pattern = self.model_sentence_transformer.encode(self.patterns, convert_to_tensor=True) 270 | self.emb_pattern_title = self.model_sentence_transformer.encode(self.pattern_titles, convert_to_tensor=True) 271 | self.emb_pattern_excerpt = self.model_sentence_transformer.encode(self.pattern_excerpts, convert_to_tensor=True) 272 | 273 | self.emb_pattern_overflow = self.model_sentence_transformer_overflow.encode(self.patterns, convert_to_tensor=True) 274 | self.emb_pattern_title_overflow = self.model_sentence_transformer_overflow.encode(self.pattern_titles, convert_to_tensor=True) 275 | self.emb_pattern_excerpt_overflow = self.model_sentence_transformer_overflow.encode(self.pattern_excerpts, convert_to_tensor=True) 276 | 277 | # Save the embeddings for later use 278 | with open(PARENT_FOLDER + 'LTR_resources/emb_pattern.pkl', 'wb') as f: 279 | pickle.dump(self.emb_pattern, f) 280 | 281 | with open(PARENT_FOLDER + 'LTR_resources/emb_pattern_title.pkl', 'wb') as f: 282 | pickle.dump(self.emb_pattern_title, f) 283 | 284 | with open(PARENT_FOLDER + 'LTR_resources/emb_pattern_excerpt.pkl', 'wb') as f: 285 | pickle.dump(self.emb_pattern_excerpt, f) 286 | 287 | with open(PARENT_FOLDER + 'LTR_resources/emb_pattern_overflow.pkl', 'wb') as f: 288 | pickle.dump(self.emb_pattern_overflow, f) 289 | 290 | with open(PARENT_FOLDER + 'LTR_resources/emb_pattern_title_overflow.pkl', 'wb') as f: 291 | pickle.dump(self.emb_pattern_title_overflow, f) 292 | 293 | with open(PARENT_FOLDER + 'LTR_resources/emb_pattern_excerpt_overflow.pkl', 'wb') as f: 294 | 
pickle.dump(self.emb_pattern_excerpt_overflow, f) 295 | 296 | def load_pattern_embeddings(self): 297 | # Load the embeddings from the saved files 298 | with open('LTR_resources/emb_pattern.pkl', 'rb') as f: 299 | self.emb_pattern = pickle.load(f) 300 | 301 | with open('LTR_resources/emb_pattern_title.pkl', 'rb') as f: 302 | self.emb_pattern_title = pickle.load(f) 303 | 304 | with open('LTR_resources/emb_pattern_excerpt.pkl', 'rb') as f: 305 | self.emb_pattern_excerpt = pickle.load(f) 306 | 307 | with open('LTR_resources/emb_pattern_overflow.pkl', 'rb') as f: 308 | self.emb_pattern_overflow = pickle.load(f) 309 | 310 | with open('LTR_resources/emb_pattern_title_overflow.pkl', 'rb') as f: 311 | self.emb_pattern_title_overflow = pickle.load(f) 312 | 313 | with open('LTR_resources/emb_pattern_excerpt_overflow.pkl', 'rb') as f: 314 | self.emb_pattern_excerpt_overflow = pickle.load(f) 315 | 316 | def semantic_similarity_features(self, q): 317 | emb_q = self.model_sentence_transformer.encode(q, convert_to_tensor=True) 318 | 319 | cosine_scores_pattern = util.cos_sim(emb_q, self.emb_pattern) 320 | cosine_scores_title = util.cos_sim(emb_q, self.emb_pattern_title) 321 | cosine_scores_excerpt = util.cos_sim(emb_q, self.emb_pattern_excerpt) 322 | 323 | emb_q = self.model_sentence_transformer_overflow.encode(q, convert_to_tensor=True) 324 | 325 | cosine_scores_pattern_overflow = util.cos_sim(emb_q, self.emb_pattern_overflow) 326 | cosine_scores_title_overflow = util.cos_sim(emb_q, self.emb_pattern_title_overflow) 327 | cosine_scores_excerpt_overflow = util.cos_sim(emb_q, self.emb_pattern_excerpt_overflow) 328 | 329 | return cosine_scores_pattern, cosine_scores_title, cosine_scores_excerpt, cosine_scores_pattern_overflow, cosine_scores_title_overflow, cosine_scores_excerpt_overflow 330 | 331 | def hadamard_product(self, tensor1, tensor2): 332 | return tensor1 * tensor2 333 | 334 | def deep_semantic_interaction_features(self, q): 335 | # Encode the query 336 | emb_q = self.model_sentence_transformer.encode(q, convert_to_tensor=True) 337 | emb_q_overflow = self.model_sentence_transformer_overflow.encode(q, convert_to_tensor=True) 338 | 339 | # Compute the "ideal" similarity, which is the query with itself 340 | ideal_emb = self.model_sentence_transformer.encode([q + " [SEP] " + q], convert_to_tensor=True) 341 | ideal_emb_overflow = self.model_sentence_transformer_overflow.encode([q + " [SEP] " + q], convert_to_tensor=True) 342 | 343 | # Compute Hadamard product between query and pattern embeddings 344 | hadamard_emb_pattern = self.hadamard_product(emb_q, self.emb_pattern) 345 | hadamard_emb_pattern_title = self.hadamard_product(emb_q, self.emb_pattern_title) 346 | hadamard_emb_pattern_excerpt = self.hadamard_product(emb_q, self.emb_pattern_excerpt) 347 | 348 | hadamard_emb_pattern_overflow = self.hadamard_product(emb_q_overflow, self.emb_pattern_overflow) 349 | hadamard_emb_pattern_title_overflow = self.hadamard_product(emb_q_overflow, self.emb_pattern_title_overflow) 350 | hadamard_emb_pattern_excerpt_overflow = self.hadamard_product(emb_q_overflow, self.emb_pattern_excerpt_overflow) 351 | 352 | # Compute Concatenation between query and pattern embeddings 353 | concat_emb_pattern = torch.cat((emb_q.unsqueeze(0), self.emb_pattern), dim=0) 354 | concat_emb_pattern_title = torch.cat((emb_q.unsqueeze(0), self.emb_pattern_title), dim=0) 355 | concat_emb_pattern_excerpt = torch.cat((emb_q.unsqueeze(0), self.emb_pattern_excerpt), dim=0) 356 | 357 | concat_emb_pattern_overflow = 
torch.cat((emb_q_overflow.unsqueeze(0), self.emb_pattern_overflow), dim=0) 358 | concat_emb_pattern_title_overflow = torch.cat((emb_q_overflow.unsqueeze(0), self.emb_pattern_title_overflow), dim=0) 359 | concat_emb_pattern_excerpt_overflow = torch.cat((emb_q_overflow.unsqueeze(0), self.emb_pattern_excerpt_overflow), dim=0) 360 | 361 | # Compute similarity between the query embeddings and the precomputed pattern embeddings 362 | similarities_pattern = util.pytorch_cos_sim(emb_q, self.emb_pattern) 363 | similarities_title = util.pytorch_cos_sim(emb_q, self.emb_pattern_title) 364 | similarities_excerpt = util.pytorch_cos_sim(emb_q, self.emb_pattern_excerpt) 365 | 366 | similarities_pattern_overflow = util.pytorch_cos_sim(emb_q_overflow, self.emb_pattern_overflow) 367 | similarities_title_overflow = util.pytorch_cos_sim(emb_q_overflow, self.emb_pattern_title_overflow) 368 | similarities_excerpt_overflow = util.pytorch_cos_sim(emb_q_overflow, self.emb_pattern_excerpt_overflow) 369 | 370 | # Returning features which include the similarities, the hadamard product embeddings, and concatenations 371 | return { 372 | "similarities": (similarities_pattern, similarities_title, similarities_excerpt, 373 | similarities_pattern_overflow, similarities_title_overflow, similarities_excerpt_overflow), 374 | "hadamard_products": (hadamard_emb_pattern, hadamard_emb_pattern_title, hadamard_emb_pattern_excerpt, 375 | hadamard_emb_pattern_overflow, hadamard_emb_pattern_title_overflow, hadamard_emb_pattern_excerpt_overflow), 376 | "concatenations": (concat_emb_pattern, concat_emb_pattern_title, concat_emb_pattern_excerpt, 377 | concat_emb_pattern_overflow, concat_emb_pattern_title_overflow, concat_emb_pattern_excerpt_overflow) 378 | } 379 | 380 | 381 | 382 | def predict_class(self, texts, model, vectorizer): 383 | loaded_model = pickle.load(open(model, 'rb')) 384 | loaded_vect = pickle.load(open(vectorizer, 'rb')) 385 | 386 | text = [preprocess_text(t, preprocess_functions) for t in texts] 387 | v_text = loaded_vect.transform(text) 388 | 389 | prediction = loaded_model.predict(v_text) 390 | 391 | return prediction 392 | 393 | def class_features(self, texts): 394 | PARENT_FOLDER = "classification_model/" 395 | 396 | # BINARY CLASS 397 | binary_prediction = self.predict_class(texts, PARENT_FOLDER + "binary_nb_model.sav", PARENT_FOLDER + "binary_vectorizer_model.sav") 398 | 399 | # MULTI CLASS 400 | multi_prediction = self.predict_class(texts,PARENT_FOLDER + "multi_nb_model.sav",PARENT_FOLDER + "multi_vectorizer_model.sav") 401 | 402 | return binary_prediction, multi_prediction 403 | -------------------------------------------------------------------------------- /letor/feature_creation.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | 3 | nltk.download('stopwords') 4 | nltk.download('punkt') 5 | 6 | import json, pickle 7 | import numpy as np 8 | from sklearn.feature_extraction.text import TfidfVectorizer 9 | from scipy import sparse 10 | import nltk 11 | from nltk.corpus import stopwords 12 | from nltk.tokenize import word_tokenize 13 | from sentence_transformers import SentenceTransformer, util 14 | from text_preprocessing import preprocess_text 15 | from text_preprocessing import to_lower, remove_stopword, lemmatize_word 16 | from transformers import BertTokenizer, BertModel 17 | import torch 18 | import os 19 | import time 20 | 21 | preprocess_functions = [to_lower, remove_stopword, lemmatize_word] 22 | 23 | PARENT_FOLDER = "" 24 | 25 | ''' 26 | Construct 
features for learning-to-rank 27 | The main function is the construct_features(q) which receives input of a query (which in our case a requirement) 28 | then it is computed for each pattern in privacypatterns.org 29 | ''' 30 | 31 | class PrivacyPatternFeatures(object): 32 | def __init__(self): 33 | self.patterns, self.pattern_titles, self.pattern_excerpts = self.get_corpus_pattern() 34 | self.initiate_tf_idf() 35 | self.initiate_bm25(0.75, 1.6) 36 | 37 | print("Loading LTR Embeddings...") 38 | 39 | self.model_sentence_transformer = SentenceTransformer('all-mpnet-base-v2') 40 | self.model_sentence_transformer_overflow = SentenceTransformer('dean-ai/legal_heBERT_ft') 41 | 42 | self.emb_pattern_file = PARENT_FOLDER + 'LTR_resources/emb_pattern.pkl' 43 | if os.path.isfile(self.emb_pattern_file): 44 | self.load_pattern_embeddings() 45 | else: 46 | self.precompute_pattern_embeddings() 47 | 48 | def construct_features(self, q): 49 | q_words = word_tokenize(self.remove_stopwords(q)) 50 | 51 | # we adapt the representation from MSLR-WEB dataset 52 | # q is query that represents the requirements 53 | # pattern is the document 54 | # each query have the pattern features 55 | # query level feature = when the parameter only contain q 56 | 57 | len_q = len(self.remove_stopwords(q)) 58 | idf_q = self.get_idf(q_words) 59 | tf_idf_q = self.tf_idf_features(q) 60 | bm25 = self.bm25(q) 61 | binary_q, multi_q = self.class_features([q]) 62 | binary_pattern, multi_pattern = self.class_features(self.patterns) 63 | 64 | cosine_pattern, cosine_title, cosine_excerpt, cosine_pattern_overflow, cosine_title_overflow, cosine_excerpt_overflow = self.semantic_similarity_features(q) 65 | deep_semantic_features = self.deep_semantic_interaction_features(q) 66 | 67 | features_all = [] 68 | for i, pattern in enumerate(self.patterns): 69 | features = [] 70 | features.extend(self.number_of_covered_words(q_words, pattern)) # 1, 2 71 | features.append(len_q) # 3 72 | features.append(idf_q) # 4 73 | features.extend(self.tf_features(q_words, pattern)) # 5 - 14 74 | features.extend(tf_idf_q) # 15 - 19 75 | features.append(bm25[i]) # 20 76 | features.append(float(cosine_pattern[0][i])) # 21 77 | features.append(float(cosine_title[0][i])) # 22 78 | features.append(float(cosine_excerpt[0][i])) # 23 79 | features.append(float(cosine_pattern_overflow[0][i])) # 24 80 | features.append(float(cosine_title_overflow[0][i])) # 25 81 | features.append(float(cosine_excerpt_overflow[0][i])) # 26 82 | 83 | features.append(binary_q[0]) 84 | features.append(multi_q[0]) 85 | features.append(binary_pattern[i]) 86 | features.append(multi_pattern[i]) 87 | 88 | # Append deep semantic interaction features: similarities 89 | features.append(float(deep_semantic_features["similarities"][0][0][i])) # pattern similarity 90 | features.append(float(deep_semantic_features["similarities"][1][0][i])) # title similarity 91 | features.append(float(deep_semantic_features["similarities"][2][0][i])) # excerpt similarity 92 | features.append(float(deep_semantic_features["similarities"][3][0][i])) # pattern similarity (overflow model) 93 | features.append(float(deep_semantic_features["similarities"][4][0][i])) # title similarity (overflow model) 94 | features.append(float(deep_semantic_features["similarities"][5][0][i])) # excerpt similarity (overflow model) 95 | 96 | # Append deep semantic interaction features: hadamard products 97 | features.extend(deep_semantic_features["hadamard_products"][0][i].tolist()) # pattern 98 | 
features.extend(deep_semantic_features["hadamard_products"][1][i].tolist()) # title 99 | features.extend(deep_semantic_features["hadamard_products"][2][i].tolist()) # excerpt 100 | features.extend(deep_semantic_features["hadamard_products"][3][i].tolist()) # pattern (overflow model) 101 | features.extend(deep_semantic_features["hadamard_products"][4][i].tolist()) # title (overflow model) 102 | features.extend(deep_semantic_features["hadamard_products"][5][i].tolist()) # excerpt (overflow model) 103 | 104 | # # Append deep semantic interaction features: concatenation 105 | features.extend(deep_semantic_features["concatenations"][0][i].tolist()) # pattern 106 | features.extend(deep_semantic_features["concatenations"][1][i].tolist()) # title 107 | features.extend(deep_semantic_features["concatenations"][2][i].tolist()) # excerpt 108 | features.extend(deep_semantic_features["concatenations"][3][i].tolist()) # pattern (overflow model) 109 | features.extend(deep_semantic_features["concatenations"][4][i].tolist()) # title (overflow model) 110 | features.extend(deep_semantic_features["concatenations"][5][i].tolist()) # excerpt (overflow model) 111 | 112 | features_all.append(features) 113 | 114 | return features_all 115 | 116 | def get_corpus_pattern(self): 117 | pattern_file= PARENT_FOLDER + "patterns.json" 118 | X = [] 119 | title = [] 120 | excerpt = [] 121 | with open(pattern_file, 'r') as p: 122 | patterns = json.loads(p.read()) 123 | 124 | for pattern in patterns: 125 | text = "" 126 | 127 | filename = pattern["filename"].replace(".md","").replace("-"," ") 128 | 129 | title.append(filename) 130 | excerpt.append(pattern["excerpt"].strip()) 131 | 132 | text += filename 133 | if not text.endswith("."): 134 | text += ". " 135 | 136 | text += pattern["excerpt"].strip() 137 | if not text.endswith("."): 138 | text += ". " 139 | 140 | for heading in pattern["heading"]: 141 | text += heading["content"].strip() 142 | if not text.endswith("."): 143 | text += ". 
" 144 | 145 | X.append(text) 146 | 147 | X_new, title_new, excerpt_new = self.get_new_patterns() 148 | 149 | X.extend(X_new) 150 | title.extend(title_new) 151 | excerpt.extend(excerpt_new) 152 | 153 | return X, title, excerpt 154 | 155 | 156 | def get_new_patterns(self): 157 | pattern_file= PARENT_FOLDER + "patterns_new.json" 158 | X = [] 159 | title = [] 160 | excerpt = [] 161 | with open(pattern_file, 'r') as p: 162 | patterns = json.loads(p.read()) 163 | 164 | for pattern in patterns: 165 | X.append(pattern["description"]) 166 | title.append(pattern["title"]) 167 | excerpt.append(pattern["excerpt"]) 168 | 169 | return X, title, excerpt 170 | 171 | def remove_stopwords(self, q): 172 | stop_words = set(stopwords.words('english')) 173 | word_tokens = word_tokenize(q) 174 | filtered_sentence = " ".join([w for w in word_tokens if not w.lower() in stop_words]) 175 | 176 | return filtered_sentence 177 | 178 | 179 | def initiate_tf_idf(self): 180 | self.tf_idf_vectorizer = TfidfVectorizer(norm=None, smooth_idf=False) 181 | self.tf_idf_vectorizer.fit(self.patterns) 182 | self.tf_idf_feature_names = self.tf_idf_vectorizer.get_feature_names_out() 183 | 184 | 185 | def initiate_bm25(self, b, k1): 186 | self.b = b 187 | self.k1 = k1 188 | 189 | y = super(TfidfVectorizer, self.tf_idf_vectorizer).transform(self.patterns) 190 | self.avdl = y.sum(1).mean() 191 | 192 | 193 | def bm25(self, q): 194 | X = self.patterns 195 | """ Calculate BM25 between query q and documents X """ 196 | b, k1, avdl = self.b, self.k1, self.avdl 197 | 198 | # apply CountVectorizer 199 | X = super(TfidfVectorizer, self.tf_idf_vectorizer).transform(X) 200 | len_X = X.sum(1).A1 201 | q, = super(TfidfVectorizer, self.tf_idf_vectorizer).transform([q]) 202 | assert sparse.isspmatrix_csr(q) 203 | 204 | # convert to csc for better column slicing 205 | X = X.tocsc()[:, q.indices] 206 | denom = X + (k1 * (1 - b + b * len_X / avdl))[:, None] 207 | # idf(t) = log [ n / df(t) ] + 1 in sklearn, so it need to be coneverted 208 | # to idf(t) = log [ n / df(t) ] with minus 1 209 | idf = self.tf_idf_vectorizer._tfidf.idf_[None, q.indices] - 1. 210 | numer = X.multiply(np.broadcast_to(idf, X.shape)) * (k1 + 1) 211 | return (numer / denom).sum(1).A1 212 | 213 | def number_of_covered_words(self, q_words, pattern): 214 | # How many terms in the user query are covered by the text. 215 | # ration = Covered query term number divided by the number of query terms. 216 | 217 | n = 0 218 | for word in q_words: 219 | if word.lower() in pattern.lower(): 220 | n += 1 221 | 222 | ratio = n/len(q_words) 223 | return [n, ratio] 224 | 225 | def get_idf(self, q_words): 226 | # 1 divided by the number of documents containing the query terms. 227 | 228 | n = 0 229 | word_in_patterns = set() 230 | for pattern in self.patterns: 231 | for word in q_words: 232 | if word.lower() in pattern.lower(): 233 | word_in_patterns.add(word.lower()) 234 | 235 | if len(list(word_in_patterns)) == 0: 236 | return 0 237 | 238 | idf = 1/len(list(word_in_patterns)) 239 | 240 | return idf 241 | 242 | def tf_features(self, q_words, pattern): 243 | # Sum, Min, Max, Average, Variance of counts of each query term in the document. 
244 | # Normalized version : term counts divided by text length 245 | 246 | pattern_words = word_tokenize(pattern) 247 | total_len = len(pattern_words) 248 | n_count_all = [pattern_words.count(word) for word in q_words] 249 | 250 | tf_sum, tf_min, tf_max, tf_avg, tf_var = sum(n_count_all), min(n_count_all), max(n_count_all), np.average(n_count_all), np.var(n_count_all) 251 | 252 | norm_tf_sum, norm_tf_min, norm_tf_max, norm_tf_avg, norm_tf_var = sum(n_count_all)/total_len, min(n_count_all)/total_len, max(n_count_all)/total_len, np.average(n_count_all)/total_len, np.var(n_count_all)/float(total_len) 253 | 254 | return [tf_sum, tf_min, tf_max, tf_avg, tf_var, norm_tf_sum, norm_tf_min, norm_tf_max, norm_tf_avg, norm_tf_var] 255 | 256 | 257 | def tf_idf_features(self, q): 258 | tfidf_matrix= self.tf_idf_vectorizer.transform([q]).todense() 259 | feature_index = tfidf_matrix[0,:].nonzero()[1] 260 | tfidf_scores = zip([self.tf_idf_feature_names[i] for i in feature_index], [tfidf_matrix[0, x] for x in feature_index]) 261 | 262 | word_scores = [score for score in dict(tfidf_scores).values()] 263 | 264 | tfidf_sum, tfidf_min, tfidf_max, tfidf_avg, tfidf_var = sum(word_scores), min(word_scores), max(word_scores), np.average(word_scores), np.var(word_scores) 265 | 266 | return [tfidf_sum, tfidf_min, tfidf_max, tfidf_avg, tfidf_var] 267 | 268 | def load_pattern_embeddings(self): 269 | # Load the embeddings from the saved files 270 | with open(PARENT_FOLDER + 'LTR_resources/emb_pattern.pkl', 'rb') as f: 271 | self.emb_pattern = pickle.load(f) 272 | 273 | with open(PARENT_FOLDER + 'LTR_resources/emb_pattern_title.pkl', 'rb') as f: 274 | self.emb_pattern_title = pickle.load(f) 275 | 276 | with open(PARENT_FOLDER + 'LTR_resources/emb_pattern_excerpt.pkl', 'rb') as f: 277 | self.emb_pattern_excerpt = pickle.load(f) 278 | 279 | with open(PARENT_FOLDER + 'LTR_resources/emb_pattern_overflow.pkl', 'rb') as f: 280 | self.emb_pattern_overflow = pickle.load(f) 281 | 282 | with open(PARENT_FOLDER + 'LTR_resources/emb_pattern_title_overflow.pkl', 'rb') as f: 283 | self.emb_pattern_title_overflow = pickle.load(f) 284 | 285 | with open(PARENT_FOLDER + 'LTR_resources/emb_pattern_excerpt_overflow.pkl', 'rb') as f: 286 | self.emb_pattern_excerpt_overflow = pickle.load(f) 287 | 288 | def precompute_pattern_embeddings(self): 289 | print("Precompute Pattern Embeddings") 290 | # Compute embeddings for patterns 291 | self.emb_pattern = self.model_sentence_transformer.encode(self.patterns, convert_to_tensor=True) 292 | self.emb_pattern_title = self.model_sentence_transformer.encode(self.pattern_titles, convert_to_tensor=True) 293 | self.emb_pattern_excerpt = self.model_sentence_transformer.encode(self.pattern_excerpts, convert_to_tensor=True) 294 | 295 | self.emb_pattern_overflow = self.model_sentence_transformer_overflow.encode(self.patterns, convert_to_tensor=True) 296 | self.emb_pattern_title_overflow = self.model_sentence_transformer_overflow.encode(self.pattern_titles, convert_to_tensor=True) 297 | self.emb_pattern_excerpt_overflow = self.model_sentence_transformer_overflow.encode(self.pattern_excerpts, convert_to_tensor=True) 298 | 299 | # Save the embeddings for later use 300 | with open(PARENT_FOLDER + 'LTR_resources/emb_pattern.pkl', 'wb') as f: 301 | pickle.dump(self.emb_pattern, f) 302 | 303 | with open(PARENT_FOLDER + 'LTR_resources/emb_pattern_title.pkl', 'wb') as f: 304 | pickle.dump(self.emb_pattern_title, f) 305 | 306 | with open(PARENT_FOLDER + 'LTR_resources/emb_pattern_excerpt.pkl', 'wb') as f: 307 | 
pickle.dump(self.emb_pattern_excerpt, f) 308 | 309 | with open(PARENT_FOLDER + 'LTR_resources/emb_pattern_overflow.pkl', 'wb') as f: 310 | pickle.dump(self.emb_pattern_overflow, f) 311 | 312 | with open(PARENT_FOLDER + 'LTR_resources/emb_pattern_title_overflow.pkl', 'wb') as f: 313 | pickle.dump(self.emb_pattern_title_overflow, f) 314 | 315 | with open(PARENT_FOLDER + 'LTR_resources/emb_pattern_excerpt_overflow.pkl', 'wb') as f: 316 | pickle.dump(self.emb_pattern_excerpt_overflow, f) 317 | 318 | def semantic_similarity_features(self, q): 319 | emb_q = self.model_sentence_transformer.encode(q, convert_to_tensor=True) 320 | 321 | cosine_scores_pattern = util.cos_sim(emb_q, self.emb_pattern) 322 | cosine_scores_title = util.cos_sim(emb_q, self.emb_pattern_title) 323 | cosine_scores_excerpt = util.cos_sim(emb_q, self.emb_pattern_excerpt) 324 | 325 | emb_q = self.model_sentence_transformer_overflow.encode(q, convert_to_tensor=True) 326 | 327 | cosine_scores_pattern_overflow = util.cos_sim(emb_q, self.emb_pattern_overflow) 328 | cosine_scores_title_overflow = util.cos_sim(emb_q, self.emb_pattern_title_overflow) 329 | cosine_scores_excerpt_overflow = util.cos_sim(emb_q, self.emb_pattern_excerpt_overflow) 330 | 331 | return cosine_scores_pattern, cosine_scores_title, cosine_scores_excerpt, cosine_scores_pattern_overflow, cosine_scores_title_overflow, cosine_scores_excerpt_overflow 332 | 333 | def hadamard_product(self, tensor1, tensor2): 334 | return tensor1 * tensor2 335 | 336 | def deep_semantic_interaction_features(self, q): 337 | # Encode the query 338 | emb_q = self.model_sentence_transformer.encode(q, convert_to_tensor=True) 339 | emb_q_overflow = self.model_sentence_transformer_overflow.encode(q, convert_to_tensor=True) 340 | 341 | # Compute the "ideal" similarity, which is the query with itself 342 | ideal_emb = self.model_sentence_transformer.encode([q + " [SEP] " + q], convert_to_tensor=True) 343 | ideal_emb_overflow = self.model_sentence_transformer_overflow.encode([q + " [SEP] " + q], convert_to_tensor=True) 344 | 345 | # Compute Hadamard product between query and pattern embeddings 346 | hadamard_emb_pattern = self.hadamard_product(emb_q, self.emb_pattern) 347 | hadamard_emb_pattern_title = self.hadamard_product(emb_q, self.emb_pattern_title) 348 | hadamard_emb_pattern_excerpt = self.hadamard_product(emb_q, self.emb_pattern_excerpt) 349 | 350 | hadamard_emb_pattern_overflow = self.hadamard_product(emb_q_overflow, self.emb_pattern_overflow) 351 | hadamard_emb_pattern_title_overflow = self.hadamard_product(emb_q_overflow, self.emb_pattern_title_overflow) 352 | hadamard_emb_pattern_excerpt_overflow = self.hadamard_product(emb_q_overflow, self.emb_pattern_excerpt_overflow) 353 | 354 | # Compute Concatenation between query and pattern embeddings 355 | concat_emb_pattern = torch.cat((emb_q.unsqueeze(0), self.emb_pattern), dim=0) 356 | concat_emb_pattern_title = torch.cat((emb_q.unsqueeze(0), self.emb_pattern_title), dim=0) 357 | concat_emb_pattern_excerpt = torch.cat((emb_q.unsqueeze(0), self.emb_pattern_excerpt), dim=0) 358 | 359 | concat_emb_pattern_overflow = torch.cat((emb_q_overflow.unsqueeze(0), self.emb_pattern_overflow), dim=0) 360 | concat_emb_pattern_title_overflow = torch.cat((emb_q_overflow.unsqueeze(0), self.emb_pattern_title_overflow), dim=0) 361 | concat_emb_pattern_excerpt_overflow = torch.cat((emb_q_overflow.unsqueeze(0), self.emb_pattern_excerpt_overflow), dim=0) 362 | 363 | # Compute similarity between the query embeddings and the precomputed pattern embeddings 364 | 
similarities_pattern = util.pytorch_cos_sim(emb_q, self.emb_pattern) 365 | similarities_title = util.pytorch_cos_sim(emb_q, self.emb_pattern_title) 366 | similarities_excerpt = util.pytorch_cos_sim(emb_q, self.emb_pattern_excerpt) 367 | 368 | similarities_pattern_overflow = util.pytorch_cos_sim(emb_q_overflow, self.emb_pattern_overflow) 369 | similarities_title_overflow = util.pytorch_cos_sim(emb_q_overflow, self.emb_pattern_title_overflow) 370 | similarities_excerpt_overflow = util.pytorch_cos_sim(emb_q_overflow, self.emb_pattern_excerpt_overflow) 371 | 372 | # Returning features which include the similarities, the hadamard product embeddings, and concatenations 373 | return { 374 | "similarities": (similarities_pattern, similarities_title, similarities_excerpt, 375 | similarities_pattern_overflow, similarities_title_overflow, similarities_excerpt_overflow), 376 | "hadamard_products": (hadamard_emb_pattern, hadamard_emb_pattern_title, hadamard_emb_pattern_excerpt, 377 | hadamard_emb_pattern_overflow, hadamard_emb_pattern_title_overflow, hadamard_emb_pattern_excerpt_overflow), 378 | "concatenations": (concat_emb_pattern, concat_emb_pattern_title, concat_emb_pattern_excerpt, 379 | concat_emb_pattern_overflow, concat_emb_pattern_title_overflow, concat_emb_pattern_excerpt_overflow) 380 | } 381 | 382 | 383 | 384 | def predict_class(self, texts, model, vectorizer): 385 | loaded_model = pickle.load(open(model, 'rb')) 386 | loaded_vect = pickle.load(open(vectorizer, 'rb')) 387 | 388 | text = [preprocess_text(t, preprocess_functions) for t in texts] 389 | v_text = loaded_vect.transform(text) 390 | 391 | prediction = loaded_model.predict(v_text) 392 | 393 | return prediction 394 | 395 | def class_features(self, texts): 396 | PARENT_FOLDER = "" 397 | 398 | # BINARY CLASS 399 | binary_prediction = self.predict_class(texts, PARENT_FOLDER + "binary_nb_model.sav", PARENT_FOLDER + "binary_vectorizer_model.sav") 400 | 401 | # MULTI CLASS 402 | multi_prediction = self.predict_class(texts,PARENT_FOLDER + "multi_nb_model.sav",PARENT_FOLDER + "multi_vectorizer_model.sav") 403 | 404 | return binary_prediction, multi_prediction 405 | 406 | def process_fold_data(fold_type, fold_num, pattern_file_path, base_path, cache={}): 407 | """ 408 | Processes the given fold data (train or test) and writes the output to a file. 409 | 410 | Parameters: 411 | - fold_type (str): Either 'train' or 'test'. 412 | - fold_num (int): The fold number (1-5). 413 | - pattern_file_path (str): Path to the patterns file. 414 | - base_path (str): Base path for input and output files. 
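    - cache (dict): Memo of already-constructed feature lists keyed by query id. Passing the
      same dict across calls (as in the example usage below) avoids recomputing features for
      a query; note that the mutable default argument is shared between calls that omit it.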
415 | """ 416 | pp = PrivacyPatternFeatures() 417 | 418 | with open(pattern_file_path, 'r') as p: 419 | patterns = json.loads(p.read()) 420 | 421 | pattern_name = [pattern["title"].replace(".md", "") for i, pattern in enumerate(patterns)] 422 | 423 | with open(base_path + f"{fold_type}_patterns_req_v2_fold_{fold_num}.json", 'r', encoding="utf-8") as p: 424 | patterns_requirements = json.loads(p.read()) 425 | 426 | lines = [] 427 | for pr in patterns_requirements: 428 | print("Query_Id", pr["id"]) 429 | 430 | # Check if features are already calculated for this pr["id"] 431 | if pr["id"] not in cache: 432 | cache[pr["id"]] = pp.construct_features(pr["req_text"]) 433 | 434 | features = cache[pr["id"]] 435 | 436 | for i_pattern, p in enumerate(pr["pattern"]): 437 | idx = pattern_name.index(p["name"]) 438 | 439 | line = "" 440 | line += "{} qid:{}".format(p["rating"], pr["id"]) 441 | 442 | for i_feature, val in enumerate(features[idx]): 443 | line += " {}:{}".format(i_feature+1, val) 444 | 445 | line += " #docid={}".format(p["name"]) 446 | lines.append(line) 447 | 448 | # Save the processed lines to a file specific to the current fold and type (train/test) 449 | with open(base_path + f"{fold_type}_fold_{fold_num}.txt", "w") as f: 450 | for l in lines: 451 | f.write(l + "\n") 452 | 453 | 454 | # Example usage: 455 | base_path = "" 456 | pattern_file = base_path + "patterns.json" 457 | 458 | # Initialize a cache dictionary to store features 459 | features_cache = {} 460 | 461 | # Process all 5 folds for both training and testing data 462 | for fold_num in range(1, 6): 463 | process_fold_data('train', fold_num, pattern_file, base_path, features_cache) 464 | process_fold_data('test', fold_num, pattern_file, base_path, features_cache) 465 | --------------------------------------------------------------------------------
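The fold files emitted by process_fold_data follow the SVMlight/LETOR convention: `<rating> qid:<query_id> <feature_index>:<value> ... #docid=<pattern name>`. The repository's own training script (letor/lightgbm_ltr_train.py) is not reproduced in this dump, so the snippet below is only an illustrative sketch of how such a fold file could be loaded and fed to a LightGBM ranker; the file names assume the `f"{fold_type}_fold_{n}.txt"` convention used above.

```python
# Minimal sketch (assumed usage, not the repository's lightgbm_ltr_train.py):
# load one fold produced by process_fold_data and fit a LambdaMART-style ranker.
import itertools

import lightgbm as lgb
from sklearn.datasets import load_svmlight_file


def load_fold(path):
    # The trailing "#docid=..." comment is ignored by the SVMlight parser.
    X, y, qid = load_svmlight_file(path, query_id=True)
    # LightGBM needs group sizes (number of candidate patterns per query) in file order.
    group = [len(list(g)) for _, g in itertools.groupby(qid)]
    return X, y, group


X_train, y_train, group_train = load_fold("train_fold_1.txt")
X_test, y_test, group_test = load_fold("test_fold_1.txt")

ranker = lgb.LGBMRanker(objective="lambdarank", n_estimators=300, learning_rate=0.05)
ranker.fit(X_train, y_train, group=group_train)

# One score per (requirement, pattern) pair; higher means the pattern ranks nearer the top.
scores = ranker.predict(X_test)
```

The group argument is what distinguishes ranking from plain regression here: LightGBM only compares candidates within the same query (requirement) when optimizing the LambdaRank objective, so the per-query group sizes must match the order in which process_fold_data wrote the lines.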