├── .gitignore ├── .gitattributes ├── GraphOfDocs ├── images │ ├── settings.jpg │ ├── GraphofDocs.jpg │ └── feature_selection.jpg ├── evaluation_results │ ├── image_20news_jira.jpg │ ├── reuters │ │ ├── REUTERS_feature_selection_2NN.jpg │ │ ├── REUTERS_feature_selection_5NN.jpg │ │ ├── REUTERS_feature_selection_LR.jpg │ │ ├── REUTERS_feature_selection_NB.jpg │ │ ├── REUTERS_feature_selection_1KNN.jpg │ │ ├── REUTERS_feature_selection_LSVM.jpg │ │ ├── REUTERS_feature_selection_1KNN_0_1.jpg │ │ ├── REUTERS_feature_selection_2NN_0_1.jpg │ │ ├── REUTERS_feature_selection_5NN_0_1.jpg │ │ ├── REUTERS_feature_selection_LR_0_1.jpg │ │ ├── REUTERS_feature_selection_LSVM_0_1.jpg │ │ ├── REUTERS_feature_selection_NB_0_1.jpg │ │ ├── REUTERS_feature_selection_NN100x50.jpg │ │ ├── REUTERS_feature_selection_NN100x50_0_1.jpg │ │ └── REUTERS_evaluation_results.csv │ ├── lingspam │ │ ├── LINGSPAM_feature_selection_2NN.jpg │ │ ├── LINGSPAM_feature_selection_5NN.jpg │ │ ├── LINGSPAM_feature_selection_LR.jpg │ │ ├── LINGSPAM_feature_selection_NB.jpg │ │ ├── LINGSPAM_feature_selection_1KNN.jpg │ │ ├── LINGSPAM_feature_selection_LR_0_1.jpg │ │ ├── LINGSPAM_feature_selection_LSVM.jpg │ │ ├── LINGSPAM_feature_selection_NB_0_1.jpg │ │ ├── LINGSPAM_feature_selection_1KNN_0_1.jpg │ │ ├── LINGSPAM_feature_selection_2NN_0_1.jpg │ │ ├── LINGSPAM_feature_selection_5NN_0_1.jpg │ │ ├── LINGSPAM_feature_selection_LSVM_0_1.jpg │ │ ├── LINGSPAM_feature_selection_NN100x50.jpg │ │ ├── LINGSPAM_feature_selection_NN100x50_0_1.jpg │ │ ├── LINGSPAM_evaluation_results.csv │ │ └── lingspam_results.txt │ ├── amazon_sentiment │ │ ├── AMAZON_feature_selection_LR.jpg │ │ ├── AMAZON_feature_selection_NB.jpg │ │ ├── AMAZON_feature_selection_1KNN.jpg │ │ ├── AMAZON_feature_selection_2NN.jpg │ │ ├── AMAZON_feature_selection_5NN.jpg │ │ ├── AMAZON_feature_selection_LSVM.jpg │ │ ├── AMAZON_feature_selection_2NN_0_1.jpg │ │ ├── AMAZON_feature_selection_5NN_0_1.jpg │ │ ├── AMAZON_feature_selection_LR_0_1.jpg │ │ ├── 
AMAZON_feature_selection_NB_0_1.jpg │ │ ├── AMAZON_feature_selection_1KNN_0_1.jpg │ │ ├── AMAZON_feature_selection_LSVM_0_1.jpg │ │ ├── AMAZON_feature_selection_NN100x50.jpg │ │ ├── AMAZON_feature_selection_NN100x50_0_1.jpg │ │ └── AMAZON_evaluation_results.csv │ ├── jira_issues │ │ ├── JIRAISSUES_feature_selection_2NN.jpg │ │ ├── JIRAISSUES_feature_selection_5NN.jpg │ │ ├── JIRAISSUES_feature_selection_LR.jpg │ │ ├── JIRAISSUES_feature_selection_NB.jpg │ │ ├── JIRAISSUES_feature_selection_1KNN.jpg │ │ ├── JIRAISSUES_feature_selection_LR_0_1.jpg │ │ ├── JIRAISSUES_feature_selection_LSVM.jpg │ │ ├── JIRAISSUES_feature_selection_NB_0_1.jpg │ │ ├── JIRAISSUES_feature_selection_1KNN_0_1.jpg │ │ ├── JIRAISSUES_feature_selection_2NN_0_1.jpg │ │ ├── JIRAISSUES_feature_selection_5NN_0_1.jpg │ │ ├── JIRAISSUES_feature_selection_LSVM_0_1.jpg │ │ ├── JIRAISSUES_feature_selection_NN100x50.jpg │ │ ├── JIRAISSUES_feature_selection_NN100x50_0_1.jpg │ │ └── JIRAISSUES_evaluation_results.csv │ ├── 20newsgroups │ │ ├── 20NEWSGROUPS_feature_selection_2NN.jpg │ │ ├── 20NEWSGROUPS_feature_selection_5NN.jpg │ │ ├── 20NEWSGROUPS_feature_selection_LR.jpg │ │ ├── 20NEWSGROUPS_feature_selection_NB.jpg │ │ ├── 20NEWSGROUPS_feature_selection_1KNN.jpg │ │ ├── 20NEWSGROUPS_feature_selection_LSVM.jpg │ │ ├── 20NEWSGROUPS_feature_selection_1KNN_0_1.jpg │ │ ├── 20NEWSGROUPS_feature_selection_2NN_0_1.jpg │ │ ├── 20NEWSGROUPS_feature_selection_5NN_0_1.jpg │ │ ├── 20NEWSGROUPS_feature_selection_LR_0_1.jpg │ │ ├── 20NEWSGROUPS_feature_selection_LSVM_0_1.jpg │ │ ├── 20NEWSGROUPS_feature_selection_NB_0_1.jpg │ │ ├── 20NEWSGROUPS_feature_selection_NN100x50.jpg │ │ ├── 20NEWSGROUPS_feature_selection_NN500x250.jpg │ │ ├── 20NEWSGROUPS_feature_selection_NN100x50_0_1.jpg │ │ ├── 20NEWSGROUPS_feature_selection_NN500x250_0_1.jpg │ │ └── 20NEWSGROUPS_evaluation_results.csv │ └── amazon_categories │ │ ├── AMAZON_feature_selection_1KNN.jpg │ │ ├── AMAZON_feature_selection_2NN.jpg │ │ ├── 
AMAZON_feature_selection_5NN.jpg │ │ ├── AMAZON_feature_selection_LR.jpg │ │ ├── AMAZON_feature_selection_LSVM.jpg │ │ ├── AMAZON_feature_selection_NB.jpg │ │ ├── AMAZON_feature_selection_LR_0_1.jpg │ │ ├── AMAZON_feature_selection_NB_0_1.jpg │ │ ├── AMAZON_feature_selection_1KNN_0_1.jpg │ │ ├── AMAZON_feature_selection_2NN_0_1.jpg │ │ ├── AMAZON_feature_selection_5NN_0_1.jpg │ │ ├── AMAZON_feature_selection_LSVM_0_1.jpg │ │ ├── AMAZON_feature_selection_NN100x50.jpg │ │ ├── AMAZON_feature_selection_NN100x50_0_1.jpg │ │ └── AMAZON_evaluation_results.csv ├── __init__.py ├── web │ └── custom │ │ ├── custom.css │ │ └── custom.js ├── neo4j_wrapper.py ├── config_experiments.py ├── parse_args.py ├── select.py ├── algos.py ├── visualize.html ├── parse_news.py ├── parse_issues.py ├── parse_reviews.py ├── utils.py ├── create.py └── evaluation.py ├── requirements.txt ├── GraphOfDocs.py ├── README.md ├── experiments.py └── LICENSE /.gitignore: -------------------------------------------------------------------------------- 1 | env/ 2 | __pycache__/ 3 | Virtual Environment/ -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | GraphOfDocs/web/vendor/* linguist-vendored 2 | -------------------------------------------------------------------------------- /GraphOfDocs/images/settings.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/images/settings.jpg -------------------------------------------------------------------------------- /GraphOfDocs/images/GraphofDocs.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/images/GraphofDocs.jpg -------------------------------------------------------------------------------- 
/GraphOfDocs/images/feature_selection.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/images/feature_selection.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/image_20news_jira.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/image_20news_jira.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/reuters/REUTERS_feature_selection_2NN.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/reuters/REUTERS_feature_selection_2NN.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/reuters/REUTERS_feature_selection_5NN.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/reuters/REUTERS_feature_selection_5NN.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/reuters/REUTERS_feature_selection_LR.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/reuters/REUTERS_feature_selection_LR.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/reuters/REUTERS_feature_selection_NB.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/reuters/REUTERS_feature_selection_NB.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/lingspam/LINGSPAM_feature_selection_2NN.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/lingspam/LINGSPAM_feature_selection_2NN.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/lingspam/LINGSPAM_feature_selection_5NN.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/lingspam/LINGSPAM_feature_selection_5NN.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/lingspam/LINGSPAM_feature_selection_LR.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/lingspam/LINGSPAM_feature_selection_LR.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/lingspam/LINGSPAM_feature_selection_NB.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/lingspam/LINGSPAM_feature_selection_NB.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/reuters/REUTERS_feature_selection_1KNN.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/reuters/REUTERS_feature_selection_1KNN.jpg 
-------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/reuters/REUTERS_feature_selection_LSVM.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/reuters/REUTERS_feature_selection_LSVM.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/lingspam/LINGSPAM_feature_selection_1KNN.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/lingspam/LINGSPAM_feature_selection_1KNN.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/lingspam/LINGSPAM_feature_selection_LR_0_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/lingspam/LINGSPAM_feature_selection_LR_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/lingspam/LINGSPAM_feature_selection_LSVM.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/lingspam/LINGSPAM_feature_selection_LSVM.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/lingspam/LINGSPAM_feature_selection_NB_0_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/lingspam/LINGSPAM_feature_selection_NB_0_1.jpg -------------------------------------------------------------------------------- 
/GraphOfDocs/evaluation_results/reuters/REUTERS_feature_selection_1KNN_0_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/reuters/REUTERS_feature_selection_1KNN_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/reuters/REUTERS_feature_selection_2NN_0_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/reuters/REUTERS_feature_selection_2NN_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/reuters/REUTERS_feature_selection_5NN_0_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/reuters/REUTERS_feature_selection_5NN_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/reuters/REUTERS_feature_selection_LR_0_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/reuters/REUTERS_feature_selection_LR_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/reuters/REUTERS_feature_selection_LSVM_0_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/reuters/REUTERS_feature_selection_LSVM_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/reuters/REUTERS_feature_selection_NB_0_1.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/reuters/REUTERS_feature_selection_NB_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/reuters/REUTERS_feature_selection_NN100x50.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/reuters/REUTERS_feature_selection_NN100x50.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/amazon_sentiment/AMAZON_feature_selection_LR.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/amazon_sentiment/AMAZON_feature_selection_LR.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/amazon_sentiment/AMAZON_feature_selection_NB.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/amazon_sentiment/AMAZON_feature_selection_NB.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/jira_issues/JIRAISSUES_feature_selection_2NN.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/jira_issues/JIRAISSUES_feature_selection_2NN.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/jira_issues/JIRAISSUES_feature_selection_5NN.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/jira_issues/JIRAISSUES_feature_selection_5NN.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/jira_issues/JIRAISSUES_feature_selection_LR.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/jira_issues/JIRAISSUES_feature_selection_LR.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/jira_issues/JIRAISSUES_feature_selection_NB.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/jira_issues/JIRAISSUES_feature_selection_NB.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/lingspam/LINGSPAM_feature_selection_1KNN_0_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/lingspam/LINGSPAM_feature_selection_1KNN_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/lingspam/LINGSPAM_feature_selection_2NN_0_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/lingspam/LINGSPAM_feature_selection_2NN_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/lingspam/LINGSPAM_feature_selection_5NN_0_1.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/lingspam/LINGSPAM_feature_selection_5NN_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/lingspam/LINGSPAM_feature_selection_LSVM_0_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/lingspam/LINGSPAM_feature_selection_LSVM_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/lingspam/LINGSPAM_feature_selection_NN100x50.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/lingspam/LINGSPAM_feature_selection_NN100x50.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/20newsgroups/20NEWSGROUPS_feature_selection_2NN.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/20newsgroups/20NEWSGROUPS_feature_selection_2NN.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/20newsgroups/20NEWSGROUPS_feature_selection_5NN.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/20newsgroups/20NEWSGROUPS_feature_selection_5NN.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/20newsgroups/20NEWSGROUPS_feature_selection_LR.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/20newsgroups/20NEWSGROUPS_feature_selection_LR.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/20newsgroups/20NEWSGROUPS_feature_selection_NB.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/20newsgroups/20NEWSGROUPS_feature_selection_NB.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/amazon_categories/AMAZON_feature_selection_1KNN.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/amazon_categories/AMAZON_feature_selection_1KNN.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/amazon_categories/AMAZON_feature_selection_2NN.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/amazon_categories/AMAZON_feature_selection_2NN.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/amazon_categories/AMAZON_feature_selection_5NN.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/amazon_categories/AMAZON_feature_selection_5NN.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/amazon_categories/AMAZON_feature_selection_LR.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/amazon_categories/AMAZON_feature_selection_LR.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/amazon_categories/AMAZON_feature_selection_LSVM.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/amazon_categories/AMAZON_feature_selection_LSVM.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/amazon_categories/AMAZON_feature_selection_NB.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/amazon_categories/AMAZON_feature_selection_NB.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/amazon_sentiment/AMAZON_feature_selection_1KNN.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/amazon_sentiment/AMAZON_feature_selection_1KNN.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/amazon_sentiment/AMAZON_feature_selection_2NN.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/amazon_sentiment/AMAZON_feature_selection_2NN.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/amazon_sentiment/AMAZON_feature_selection_5NN.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/amazon_sentiment/AMAZON_feature_selection_5NN.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/amazon_sentiment/AMAZON_feature_selection_LSVM.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/amazon_sentiment/AMAZON_feature_selection_LSVM.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/jira_issues/JIRAISSUES_feature_selection_1KNN.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/jira_issues/JIRAISSUES_feature_selection_1KNN.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/jira_issues/JIRAISSUES_feature_selection_LR_0_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/jira_issues/JIRAISSUES_feature_selection_LR_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/jira_issues/JIRAISSUES_feature_selection_LSVM.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/jira_issues/JIRAISSUES_feature_selection_LSVM.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/jira_issues/JIRAISSUES_feature_selection_NB_0_1.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/jira_issues/JIRAISSUES_feature_selection_NB_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/reuters/REUTERS_feature_selection_NN100x50_0_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/reuters/REUTERS_feature_selection_NN100x50_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/20newsgroups/20NEWSGROUPS_feature_selection_1KNN.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/20newsgroups/20NEWSGROUPS_feature_selection_1KNN.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/20newsgroups/20NEWSGROUPS_feature_selection_LSVM.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/20newsgroups/20NEWSGROUPS_feature_selection_LSVM.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/amazon_categories/AMAZON_feature_selection_LR_0_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/amazon_categories/AMAZON_feature_selection_LR_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/amazon_categories/AMAZON_feature_selection_NB_0_1.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/amazon_categories/AMAZON_feature_selection_NB_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/amazon_sentiment/AMAZON_feature_selection_2NN_0_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/amazon_sentiment/AMAZON_feature_selection_2NN_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/amazon_sentiment/AMAZON_feature_selection_5NN_0_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/amazon_sentiment/AMAZON_feature_selection_5NN_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/amazon_sentiment/AMAZON_feature_selection_LR_0_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/amazon_sentiment/AMAZON_feature_selection_LR_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/amazon_sentiment/AMAZON_feature_selection_NB_0_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/amazon_sentiment/AMAZON_feature_selection_NB_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/jira_issues/JIRAISSUES_feature_selection_1KNN_0_1.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/jira_issues/JIRAISSUES_feature_selection_1KNN_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/jira_issues/JIRAISSUES_feature_selection_2NN_0_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/jira_issues/JIRAISSUES_feature_selection_2NN_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/jira_issues/JIRAISSUES_feature_selection_5NN_0_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/jira_issues/JIRAISSUES_feature_selection_5NN_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/jira_issues/JIRAISSUES_feature_selection_LSVM_0_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/jira_issues/JIRAISSUES_feature_selection_LSVM_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/jira_issues/JIRAISSUES_feature_selection_NN100x50.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/jira_issues/JIRAISSUES_feature_selection_NN100x50.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/lingspam/LINGSPAM_feature_selection_NN100x50_0_1.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/lingspam/LINGSPAM_feature_selection_NN100x50_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/20newsgroups/20NEWSGROUPS_feature_selection_1KNN_0_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/20newsgroups/20NEWSGROUPS_feature_selection_1KNN_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/20newsgroups/20NEWSGROUPS_feature_selection_2NN_0_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/20newsgroups/20NEWSGROUPS_feature_selection_2NN_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/20newsgroups/20NEWSGROUPS_feature_selection_5NN_0_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/20newsgroups/20NEWSGROUPS_feature_selection_5NN_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/20newsgroups/20NEWSGROUPS_feature_selection_LR_0_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/20newsgroups/20NEWSGROUPS_feature_selection_LR_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/20newsgroups/20NEWSGROUPS_feature_selection_LSVM_0_1.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/20newsgroups/20NEWSGROUPS_feature_selection_LSVM_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/20newsgroups/20NEWSGROUPS_feature_selection_NB_0_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/20newsgroups/20NEWSGROUPS_feature_selection_NB_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/20newsgroups/20NEWSGROUPS_feature_selection_NN100x50.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/20newsgroups/20NEWSGROUPS_feature_selection_NN100x50.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/amazon_categories/AMAZON_feature_selection_1KNN_0_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/amazon_categories/AMAZON_feature_selection_1KNN_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/amazon_categories/AMAZON_feature_selection_2NN_0_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/amazon_categories/AMAZON_feature_selection_2NN_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/amazon_categories/AMAZON_feature_selection_5NN_0_1.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/amazon_categories/AMAZON_feature_selection_5NN_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/amazon_categories/AMAZON_feature_selection_LSVM_0_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/amazon_categories/AMAZON_feature_selection_LSVM_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/amazon_categories/AMAZON_feature_selection_NN100x50.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/amazon_categories/AMAZON_feature_selection_NN100x50.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/amazon_sentiment/AMAZON_feature_selection_1KNN_0_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/amazon_sentiment/AMAZON_feature_selection_1KNN_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/amazon_sentiment/AMAZON_feature_selection_LSVM_0_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/amazon_sentiment/AMAZON_feature_selection_LSVM_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/amazon_sentiment/AMAZON_feature_selection_NN100x50.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/amazon_sentiment/AMAZON_feature_selection_NN100x50.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/20newsgroups/20NEWSGROUPS_feature_selection_NN500x250.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/20newsgroups/20NEWSGROUPS_feature_selection_NN500x250.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/amazon_sentiment/AMAZON_feature_selection_NN100x50_0_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/amazon_sentiment/AMAZON_feature_selection_NN100x50_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/jira_issues/JIRAISSUES_feature_selection_NN100x50_0_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/jira_issues/JIRAISSUES_feature_selection_NN100x50_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/20newsgroups/20NEWSGROUPS_feature_selection_NN100x50_0_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/20newsgroups/20NEWSGROUPS_feature_selection_NN100x50_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/20newsgroups/20NEWSGROUPS_feature_selection_NN500x250_0_1.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/20newsgroups/20NEWSGROUPS_feature_selection_NN500x250_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/amazon_categories/AMAZON_feature_selection_NN100x50_0_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NC0DER/GraphOfDocs/HEAD/GraphOfDocs/evaluation_results/amazon_categories/AMAZON_feature_selection_NN100x50_0_1.jpg -------------------------------------------------------------------------------- /GraphOfDocs/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | 'Neo4jDatabase', 3 | 'create_graph_of_words', 4 | 'run_initial_algorithms', 5 | 'create_similarity_graph', 6 | 'create_clustering_tags', 7 | 'generate_words', 8 | 'read_datasets', 9 | 'clear_screen', 10 | 'parser' 11 | ] 12 | -------------------------------------------------------------------------------- /GraphOfDocs/web/custom/custom.css: -------------------------------------------------------------------------------- 1 | body, html { 2 | height: 100%; 3 | max-width: 100%; 4 | overflow-x: hidden; 5 | font-family: Consolas; 6 | } 7 | 8 | #viz { 9 | width: 100%; 10 | height: 700px; 11 | margin-top: 20px; 12 | border: 1px solid #ced4da; 13 | border-radius: .25rem; 14 | margin-bottom: 10px; 15 | } 16 | 17 | .form-group { 18 | min-width: 100%; 19 | } 20 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cycler==0.10.0 2 | joblib==0.14.1 3 | kiwisolver==1.1.0 4 | matplotlib==3.2.0 5 | neo4j==1.7.6 6 | neobolt==1.7.16 7 | neotime==1.7.4 8 | nltk==3.4.5 9 | numpy==1.18.1 10 | pandas==1.0.1 11 | pip==21.1 12 | 
class Neo4jDatabase(object):
    """
    Wrapper class to handle the database
    more efficiently, by abstracting repeating code.
    """
    def __init__(self, uri, user, password):
        """Open a driver for `uri`, authenticating with `user` and `password`."""
        self._driver = GraphDatabase.driver(uri, auth=(user, password))

    def close(self):
        """Close the underlying driver."""
        self._driver.close()

    def execute(self, query, mode):
        """
        Run `query` in a managed transaction and return its rows.

        Parameters:
            query: the Cypher text passed to the transaction's run().
            mode: 'r' to use a read transaction, 'w' to use a write transaction.

        Returns:
            Whatever the transaction function returns: the result values,
            or None when the query raised a CypherError.

        Raises:
            TypeError: if `mode` is neither 'r' nor 'w'.
        """
        # Reject a bad mode before touching the driver, so that a caller
        # mistake does not cost a session round trip.
        if mode not in ('r', 'w'):
            raise TypeError('Execution mode can either be (r)ead or (w)rite!')

        with self._driver.session() as session:
            if mode == 'r': # Reading query.
                return session.read_transaction(self.__execute, query)
            # Writing query.
            return session.write_transaction(self.__execute, query)

    @staticmethod # private method.
    def __execute(tx, query):
        """
        Transaction function: run `query` on `tx` and return result.values().
        A CypherError is deliberately absorbed and reported as None,
        so that one erroneous query does not break the execution.
        """
        try:
            # run() is inside the try as well, so that an error raised while
            # submitting the query is handled like one raised while reading it.
            return tx.run(query).values()
        except CypherError:
            return None
30 | -------------------------------------------------------------------------------- /GraphOfDocs/config_experiments.py: -------------------------------------------------------------------------------- 1 | from sklearn.naive_bayes import MultinomialNB 2 | from sklearn.linear_model import LogisticRegression 3 | from sklearn.neural_network import MLPClassifier 4 | from sklearn.neighbors import KNeighborsClassifier 5 | from sklearn.svm import LinearSVC 6 | 7 | MIN_NUMBER_OF_DOCUMENTS_PER_SELECTED_COMMUNITY = 2 8 | DATASET_PATH = \ 9 | r'C:\Users\USER\source\repos\GraphOfDocs\GraphOfDocs\datasets\amazon' 10 | 11 | PLOTS_PREFIX = 'AMAZON' 12 | EXPERIMENTAL_RESULTS_OUΤPUT_DIR = \ 13 | r'C:\Users\USER\source\repos\GraphOfDocs\GraphOfDocs\experimental_results\amazon' 14 | 15 | # Feature selection 16 | VARIANCE_THRESHOLD = [0.0005, 0.001, 0.0015, 0.002, 0.003, 0.004, 0.005, 0.01] 17 | SELECT_KBEST_K = [350, 500, 1000, 2000, 3000, 4000, 5000, 6000, 7000] 18 | 19 | # Graph of docs feature selection. 
def extract_file_class(filename):
    """
    Return the class label embedded in a dataset filename.

    The part of `filename` before the first underscore is split on dots,
    and the second dot-separated piece is returned,
    e.g. 'books.negative_12' gives 'negative'.
    An IndexError is raised when that part contains no dot.
    """
    prefix, _, _ = filename.partition('_')
    dotted_parts = prefix.split('.')
    return dotted_parts[1]
def get_communities_filenames(database):
    """
    This function retrieves all filenames (and the file count)
    for every community of similar documents.

    The rows are returned exactly as database.execute() produced them,
    ordered by file count in descending order.
    """
    query = ('MATCH (d:Document) RETURN d.community, '
             'collect(d.filename) AS files, '
             'count(d.filename) AS file_count '
             'ORDER BY file_count DESC')
    return database.execute(query, 'r')

def get_communities_tags(database, top_terms = None):
    """
    This function generates the most important terms that describe
    each community of similar documents, and returns them for all communities.

    Parameters:
        database: object exposing execute(query, mode).
        top_terms: maximum number of terms kept per community;
                   None keeps every term.

    Returns:
        A dict that maps each community to its list of terms.
    """
    # Get all intersecting nodes of each community,
    # ranked by their in-degree (which shows to how many documents they belong to),
    # and pagerank score in descending order.
    query = ('MATCH p=((d:Document)-[:includes]->(w:Word)) '
             'WITH d.community as community, w, count(p) as degree '
             'WHERE degree > 1 '
             'WITH community as com, w.key as word, w.pagerank as pagerank, degree as deg '
             'ORDER BY com, deg DESC, pagerank DESC '
             'RETURN com, collect([word, pagerank, deg])')
    communities = database.execute(query, 'r')

    # Each row is [community, [[word, pagerank, degree], ...]].
    # Slicing with None keeps the whole list, so one expression
    # covers both the limited and the unlimited case.
    return {
        community: [tag[0] for tag in tags_scores[:top_terms]]
        for community, tags_scores in communities
    }

def get_word_digrams_by_filename(database, filename):
    """
    Return the weighted word digrams of the document called `filename`.

    The query matches pairs of connected words that are both included
    in that document, and collects [source, target, weight] triples
    ordered by weight in descending order.
    The rows are returned exactly as database.execute() produced them.
    """
    # The filename is embedded in a double-quoted Cypher string literal.
    # Backslashes and double quotes are escaped, so that a filename
    # containing them cannot terminate the literal or alter the query.
    escaped = f'{filename}'.replace('\\', '\\\\').replace('"', '\\"')
    query = (f'MATCH (d:Document {{filename: "{escaped}"}})'
             '-[:includes]->(w1:Word)-[r:connects]->(w2:Word)'
             '<-[:includes]-(d) WHERE id(w1) < id(w2) '
             'WITH w1.key AS source, w2.key AS target, r.weight AS weight '
             'ORDER BY weight DESC RETURN collect([source, target, weight]) AS digrams')
    return database.execute(query, 'r')
def _weight_clause(weight):
    """
    Build the optional ', weightProperty: "<name>"' configuration entry.
    An empty `weight` yields an empty string. Quotes that the caller
    already wrapped around the name are dropped, so it is quoted exactly once.
    """
    name = weight.strip('"\'')
    if not name:
        return ''
    return f', weightProperty: "{name}"'

def pagerank(database, node, edge, iterations, property, weight = ''):
    """
    Call algo.pageRank in the database and write the score on each node.

    Parameters:
        database: object exposing execute(query, mode).
        node: node label passed to the procedure.
        edge: relationship type passed to the procedure.
        iterations: number of iterations (int).
        property: name of the property that receives the score.
        weight: optional name of the relationship weight property.

    Raises:
        TypeError: if an argument has the wrong type.
    """
    type_correct = all([isinstance(node, str),
                        isinstance(edge, str),
                        isinstance(iterations, int),
                        isinstance(property, str),
                        isinstance(weight, str)])

    if not type_correct:
        raise TypeError('All arguments should be strings, except iterations which should be int!')

    # The property name has to be a quoted string inside the config map,
    # exactly as louvain() sends it; unquoted it would be read as a variable.
    weight = _weight_clause(weight)

    query = (f'CALL algo.pageRank("{node}", "{edge}", '
             f'{{iterations: {iterations}, dampingFactor: 0.85, write: true, writeProperty: "{property}"{weight}}}) '
             'YIELD nodes, iterations, loadMillis, computeMillis, writeMillis, dampingFactor, write, writeProperty')
    database.execute(query, 'w')
    return

def louvain(database, node, edge, property, weight = ''):
    """
    Call algo.louvain in the database and write the community on each node.

    Parameters:
        database: object exposing execute(query, mode).
        node: node label passed to the procedure.
        edge: relationship type passed to the procedure.
        property: name of the property that receives the community.
        weight: optional name of the relationship weight property.

    Raises:
        TypeError: if an argument is not a string.
    """
    type_correct = all([isinstance(node, str),
                        isinstance(edge, str),
                        isinstance(property, str),
                        isinstance(weight, str)])

    if not type_correct:
        raise TypeError('All arguments should be strings!')

    weight = _weight_clause(weight)

    query = (f'CALL algo.louvain("{node}", "{edge}", '
             f'{{direction: "BOTH", writeProperty: "{property}"{weight}}}) '
             'YIELD nodes, communityCount, iterations, loadMillis, computeMillis, writeMillis')
    database.execute(query, 'w')
    return

def jaccard(database, source, edge, target, cutoff, relationship, property):
    """
    Call algo.similarity.jaccard for every `source` node, using the ids of
    the `target` nodes it reaches through `edge` as its categories,
    and write the result as `relationship` edges carrying `property`.

    Parameters:
        database: object exposing execute(query, mode).
        source, edge, target: labels / relationship type used in the MATCH.
        cutoff: similarity cutoff (float).
        relationship: type of the relationship that is written.
        property: name of the property written on that relationship.

    Raises:
        TypeError: if an argument has the wrong type.
    """
    type_correct = all([isinstance(source, str),
                        isinstance(edge, str),
                        isinstance(target, str),
                        isinstance(relationship, str),
                        isinstance(property, str),
                        isinstance(cutoff, float)])

    if not type_correct:
        raise TypeError('All arguments should be strings, except cutoff which should be a float!')

    query = (
        f'MATCH (d:{source})-[:{edge}]->(w:{target}) '
        'WITH {item:id(d), categories: collect(id(w))} as data '
        'WITH collect(data) as Data '
        f'CALL algo.similarity.jaccard(Data, {{topK: 1, similarityCutoff: {cutoff}, write: true, writeRelationshipType: "{relationship}", writeProperty: "{property}"}}) '
        'YIELD nodes, similarityPairs, write, writeRelationshipType, writeProperty, min, max, mean, stdDev, p25, p50, p75, p90, p95, p99, p999, p100 '
        'RETURN nodes, similarityPairs, write, writeRelationshipType, writeProperty, min, max, mean, p95 ')
    database.execute(query, 'w')
    return
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 | 20 |
21 |
22 | 23 |
24 |
25 | 26 |
27 |
28 | 29 |
30 |
31 | 32 |
33 |
34 | 35 |
36 |
37 | 38 |
39 |
40 | 41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 | 50 | 51 |
52 |
53 |
// The single NeoVis instance shared by every handler below; draw() assigns it.
var viz;

// Once the DOM is ready, render the default view:
// connected Word nodes whose pagerank lies strictly between 90 and 200.
$(document).ready(function () {
    var initialQuery = [
        "MATCH (n:Word)-[r:connects]-(k)",
        "WHERE n.pagerank > 90",
        "AND k.pagerank > 90",
        "AND n.pagerank < 200",
        "AND k.pagerank < 200",
        "RETURN n,r,k LIMIT 1000"
    ].join(" ");
    draw(initialQuery);
});

// "Query" button: rebuild the same query from the three form fields
// (lower pagerank bound, upper pagerank bound, minimum edge weight).
$("#query").click(function () {
    var low = $("#field1").val();
    var high = $("#field2").val();
    var minWeight = $("#field3").val();
    if (low === "" || high === "" || minWeight === "") {
        alert("Please speficy the ranges and/or score!");
        return;
    }
    // The field values are inserted into the Cypher text as typed.
    var rangeQuery = [
        "MATCH (n:Word)-[r:connects]-(k)",
        "WHERE n.pagerank > " + low,
        "AND k.pagerank > " + low,
        "AND n.pagerank < " + high,
        "AND k.pagerank < " + high,
        "AND r.weight >= " + minWeight,
        "RETURN n,r,k LIMIT 1000"
    ].join(" ");
    viz.renderWithCypher(rangeQuery);
});

// "Stabilize" button: forward to the visualization object.
$("#stabilize").click(function () {
    viz.stabilize();
});

// Free-form query box: pressing Enter runs the typed Cypher.
$("#textarea").keyup(function (e) {
    var pressedKey = e.keyCode || e.which;
    if (pressedKey !== 13) { // Only react to the Enter key.
        return;
    }
    var box = $("#textarea");
    // Drop every newline and write the text back,
    // so the box keeps the query without the newline Enter inserted.
    var cypher = box.val().replace(/\r?\n|\r/g, "");
    box.val(cypher);
    if (cypher === "") {
        alert("Please supply a query!");
        return;
    }
    viz.renderWithCypher(cypher);
});

// Create the global viz object for the given Cypher query and render it.
function draw(query) {
    // Per-label settings: which node property feeds each visual attribute.
    var nodeLabels = {
        "Word": {
            caption: "key",
            size: "pagerank",
            community: "community"
        },
        "Document": {
            caption: "filename",
            size: "none",
            community: "community"
        }
    };
    // Per-relationship settings: property used for caption and thickness.
    var relationshipTypes = {
        "connects": {
            caption: "weight",
            thickness: "weight"
        },
        "includes": {
            caption: "none",
            thickness: "none"
        },
        "is_similar": {
            caption: "score",
            thickness: "score"
        },
        "has_tag": {
            caption: "none",
            thickness: "none"
        }
    };
    // Connection details and target container of the visualization.
    var config = {
        container_id: "viz",
        server_url: "bolt://localhost:7687",
        server_user: "neo4j",
        server_password: "123",
        labels: nodeLabels,
        relationships: relationshipTypes,
        initial_cypher: query
    };
    viz = new NeoVis.default(config);
    viz.render();
    return;
}
19 | text = ''.join(filter(lambda x: x in string.printable, text)) 20 | 21 | count = 1 22 | total_count = text.count(' 25 | for start, end in zip(find_all(text, '')): 26 | # Print the number of the currently processed news story. 27 | print(f'Processing {count} out of {total_count} news stories...') 28 | # Adjust the index to point after the starting tag. 29 | start = start + offset 30 | 31 | # Retrieve all content from these sgml tags. 32 | topics = get_tag_value(text, 'TOPICS', start, end) 33 | title = get_tag_value(text, 'TITLE', start, end) 34 | news_text = get_tag_value(text, 'BODY', start, end) 35 | # If the news story lacks these fields proceed to the next one. 36 | if topics is None or title is None or news_text is None: 37 | count = count + 1 38 | clear_screen(current_system) 39 | continue 40 | 41 | # Remove tags that encapsulate topic values, and separate them with periods. 42 | topics = topics.replace('', '').replace('', '.') 43 | # Remove last period separator from string. 44 | topics = topics[:-1] 45 | # Join all plaintext information on a single string, then write it in a file. 46 | document_text = '\n'.join((title, news_text)) 47 | 48 | # Each news story will have a filename consisting of _ 49 | filename = '_'.join((topics, str(count))) 50 | # If the filename already exists, the file will be overwritten. 51 | # All files will be created in the default folder: output/ 52 | # If the directory already exists, there will be raised no exception. 53 | Path('output').mkdir(exist_ok = True) 54 | with open(''.join(('output/', filename)), 'w') as document: 55 | document.write(document_text) 56 | count = count + 1 57 | # Clear the screen to output the update the progress counter. 58 | clear_screen(current_system) 59 | return 60 | 61 | if __name__ == '__main__': 62 | if(len(sys.argv) > 1): 63 | # Filepath is expected to be the 2nd argument. 
def convert_json_issues_to_files(filepath):
    """
    Function that reads a json file containing multiple jira issues,
    which are processed to have their plaintext extracted, without any metadata.
    Each issue of a chosen assignee is written into its own file,
    in the output directory, and named <assignee key>_<issue key>.

    Parameters:
        filepath: path of the json file; its top-level 'issues' entry
                  is the list of issues that gets processed.

    Issues without an assignee, or whose assignee is not in the chosen list,
    are ignored. Issues whose summary and description hold no text are
    counted as skipped and reported in the final message.
    """
    current_system = platform.system()
    # List that contains the chosen assignees
    assignees = [
        'aantonenko', 'andrus', 'atkach', 'wesmckinn', 'julianhyde',
        'andy.seaborne', 'purplecabbage', 'ababiichuk', 'jbellis',
        'batik-dev@xmlgraphics.apache.org', 'djohnson', 'ancosen',
        'elserj', 'bowserj', 'onechiporenko']
    # Characters removed from an assignee key before it is compared
    # with the list above and used as part of a filename.
    removed_characters = str.maketrans('', '', '\'\"/*:?<>|_')

    with open(filepath, 'rt', encoding = 'utf-8-sig', errors = 'ignore') as dataset:
        # Load the json object in memory as a list of dictionaries.
        issues = json.load(dataset)['issues']

    total_count = len(issues)
    written = 0
    skip = 0
    # Iterate all issues; enumerate keeps the progress counter moving
    # on every path, including the skipped ones.
    for count, issue in enumerate(issues, start = 1):
        # Print the number of the currently processed issue.
        print(f'Processing {count} out of {total_count} issues...')
        # Retrieve all important fields from the dictionary.
        fields = issue['fields']
        issue_key = issue['key']
        issue_summary = ('' if fields['summary'] is None else fields['summary'])
        issue_description = ('' if fields['description'] is None else fields['description'])
        # An issue without an assignee cannot belong to a chosen one;
        # the empty key simply fails the membership check below.
        assignee = fields['assignee']
        assignee_key = ('' if assignee is None
                        else assignee['key'].translate(removed_characters))

        # Choose the top 15 assignees of the dataset.
        if assignee_key not in assignees:
            clear_screen(current_system)
            continue

        # Each issue will become a file.
        file_name = '_'.join((assignee_key, issue_key))
        document_text = '\n'.join((issue_summary, issue_description))
        document_text = ''.join(filter(lambda x: x in string.printable, document_text))

        # Skip issues with empty text. The joined text always contains
        # the '\n' separator, so it has to be tested after stripping.
        if document_text.strip() == '':
            skip = skip + 1
            clear_screen(current_system)
            continue

        # If the filename already exists, the file will be overwritten.
        # All files will be created in the default folder: output/
        # If the directory already exists, there will be raised no exception.
        Path('output').mkdir(exist_ok = True)
        with open(''.join(('output/', file_name)), 'w') as document:
            document.write(document_text)
        written = written + 1
        # Clear the screen to output the updated progress counter.
        clear_screen(current_system)
    # Report the files actually written and the skipped items, if any.
    print(f'Loaded {written} issues, skipped {skip} empty items.')
    return
def graphofdocs(create, initialize, dirpath, window_size,
    extend_window, remove_stopwords, lemmatize, stem):
    """
    Build and/or (re)initialize the graphofdocs model in the Neo4j database.

    Parameters:
        create: when true, delete every node, create the uniqueness
                constraint on Word.key and insert a graph of words
                for each file read from `dirpath`.
        initialize: when true, run the initial algorithms and create the
                    similarity graph and the clustering tags.
        dirpath: directory handed to read_dataset(); only used if `create`.
        window_size: forwarded to create_graph_of_words().
        extend_window, remove_stopwords, lemmatize, stem:
                forwarded to generate_words().
                NOTE(review): the command line passes its --insert-stopwords
                flag in the remove_stopwords position; confirm the intended
                polarity against generate_words().

    The app exits with status 1 if the database cannot be reached.
    Values returned by create_graph_of_words() for files it did not insert
    are written, one per line, to skipped.log.
    """
    current_system = platform.system()
    # Open the database.
    try:
        database = Neo4jDatabase('bolt://localhost:7687', 'neo4j', '123')
    # Neo4j server is unavailable.
    # This client app cannot open a connection.
    except ServiceUnavailable:
        print('\t* Neo4j database is unavailable.')
        print('\t* Please check the database connection before running this app.')
        input('\t* Press any key to exit the app...')
        sys.exit(1)

    # From here on the connection is released on every exit path,
    # including an exception raised while building the model.
    try:
        if create:
            # List that retains the skipped filenames.
            skipped = []
            # Delete nodes from previous iterations.
            database.execute('MATCH (n) DETACH DELETE n', 'w')

            # Create uniqueness constraint on key to avoid duplicate word nodes.
            database.execute('CREATE CONSTRAINT ON (word:Word) ASSERT word.key IS UNIQUE', 'w')

            # Read text from files, which becomes a string in a list called dataset.
            dataset = read_dataset(dirpath)
            total_count = len(dataset)
            # Iterate all file records of the dataset.
            for count, (filename, file) in enumerate(dataset, start = 1):
                # Print the number of the currently processed file.
                print(f'Processing {count} out of {total_count} files...' )
                # Generate the terms from the text of each file.
                words = generate_words(file, extend_window, remove_stopwords, lemmatize, stem)
                # Create the graph of words in the database.
                value = create_graph_of_words(words, database, filename, window_size)
                if value is not None:
                    skipped.append(value)
                # Clear the screen to output the updated progress counter.
                clear_screen(current_system)
            # Count all skipped files and write their filenames in skipped.log
            skip_count = len(skipped)
            print(f'Created {total_count - skip_count}, skipped {skip_count} files.')
            print('Check skipped.log for info.')
            with open('skipped.log', 'w') as log:
                log.writelines(item + '\n' for item in skipped)

        if initialize:
            # Run initialization functions.
            run_initial_algorithms(database)
            create_similarity_graph(database)
            create_clustering_tags(database)
    finally:
        database.close()
    return
def find_all(string, sub):
    """
    Generator function to iterate over all occurrences of substring in a string.

    Yields the start index of each non-overlapping occurrence of `sub`
    in `string`, from left to right.
    An empty `sub` yields every index from 0 to len(string).
    """
    # Step past the match that was just reported. An empty `sub` has
    # length 0, so step by at least one; otherwise the search would
    # return the same index forever.
    step = max(len(sub), 1)
    start = string.find(sub)
    while start != -1:
        yield start
        start = string.find(sub, start + step)
45 | value = string[tag_start_idx:tag_end_idx] 46 | return value 47 | 48 | """ 49 | Function that read an xml-like-syntax file containing multiple amazon review xml tags, 50 | which are processed to have their plaintext extracted, without any metadata. 51 | Finally, each review is being written into its own file, in the output directory. 52 | """ 53 | def convert_xml_reviews_to_files(filepath): 54 | current_system = platform.system() 55 | with open(filepath, 'rt', encoding = 'utf-8-sig', errors = 'ignore') as file: 56 | # Read file contents and remove newline characters. 57 | text = file.read().replace('\n', ' ').replace('\r', '') 58 | # Remove non-printable characters from string. 59 | text = ''.join(filter(lambda x: x in string.printable, text)) 60 | # Find the filename, and remove the .review extension. 61 | filename = os.path.basename(filepath) 62 | if filename.endswith('.review'): 63 | filename = filename[:-7] 64 | 65 | count = 1 66 | total_count = text.count('') 67 | offset = len('') 68 | # Iterate through text that has matching xml tags of 69 | for start, end in zip(find_all(text, ''), find_all(text, '')): 70 | # Print the number of the currently processed review. 71 | print(f'Processing {count} out of {total_count} reviews...') 72 | # Adjust the index to point after the starting tag. 73 | start = start + offset 74 | 75 | # Retrieve all content from these xml tags. 76 | product_name = get_tag_value(text, 'product_name', start, end) 77 | title = get_tag_value(text, 'title', start, end) 78 | review_text = get_tag_value(text, 'review_text', start, end) 79 | 80 | # Join all review information on a single string, then write it in a file. 81 | document_text = '\n'.join((product_name, title, review_text)) 82 | 83 | # If the filename already exists, the file will be overwritten. 84 | # All files will be created in the default folder: output/ 85 | # If the directory already exists, there will be raised no exception. 
86 | new_filename = '_'.join((filename, str(count))) 87 | Path('output').mkdir(exist_ok = True) 88 | with open(''.join(('output/', new_filename)), 'w') as document: 89 | document.write(document_text) 90 | count = count + 1 91 | # Clear the screen to output the update the progress counter. 92 | clear_screen(current_system) 93 | return 94 | 95 | if __name__ == '__main__': 96 | if(len(sys.argv) > 1): 97 | # Filepath is expected to be the 2nd argument. 98 | convert_xml_reviews_to_files(sys.argv[1]) 99 | else: 100 | print('Please input a file path, after parse_reviews.py.') 101 | -------------------------------------------------------------------------------- /GraphOfDocs/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script contains utility functions 3 | e.g to read files, preprocess text, etc. 4 | """ 5 | from os import system 6 | from os import listdir 7 | from os.path import isfile, join 8 | from string import punctuation, printable 9 | from nltk import pos_tag, sent_tokenize 10 | from nltk.corpus import wordnet, stopwords 11 | from nltk.stem.wordnet import WordNetLemmatizer 12 | from nltk.stem import PorterStemmer 13 | from nltk.tokenize import word_tokenize 14 | 15 | lemmatizer = WordNetLemmatizer() # Initialize lemmatizer once. 16 | stemmer = PorterStemmer() # Initialize Porter's stemmer once. 17 | 18 | stop_words = set(stopwords.words('english')).union([ # Augment the stopwords set. 
# Lookup tables are built once at import time instead of on every call.
_QUOTES_TABLE = {ord(c): '' for c in '\'\"'}
_PUNCTUATION_TABLE = {ord(c): ' ' for c in punctuation}
_PRINTABLE_CHARS = frozenset(printable)

def get_wordnet_tag(tag):
    """
    Function that maps default part-of-speech
    tags to wordnet part-of-speech tags.
    Tags that match none of the known prefixes map to wordnet.NOUN.
    """
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('R'):
        return wordnet.ADV
    # 'N' tags and everything else: the default lemmatizer parameter.
    return wordnet.NOUN

def generate_words(text, extend_window = False, insert_stopwords = False, lemmatize = False, stem = False):
    """
    Function that generates words from a text corpus and optionally lemmatizes / stems them.
    Returns a list of lowercase tokens in order of appearance (duplicates are kept).

    extend_window: if False, the end of each sentence is marked with the special token 'e5c'.
    insert_stopwords: if False, stopwords, numbers and tokens shorter than 3 characters are dropped.
    lemmatize: if True, tokens are replaced by their lemmas, based on their part-of-speech tags.
    stem: if True, tokens are replaced by their stems.
    """
    # Remove all whitespace characters (by split) and join on space.
    text = ' '.join(text.split())
    # Handle special characters that connect words.
    text = text.translate(_QUOTES_TABLE)
    # Find all end of sentences and introduce a special string to track them.
    # If they aren't tracked, then the window is allowed to be extended from one sentence to another,
    # thus connecting the last terms of one sentence with the starting ones of the next.
    if not extend_window:
        for sentence_end in ('. ', '! ', '? '):
            text = text.replace(sentence_end, ' e5c ')
    # Translate punctuation to space and lowercase the string.
    text = text.translate(_PUNCTUATION_TABLE).lower()
    # We are cleaning the data from stopwords, numbers and leftover syllables/letters.
    if not insert_stopwords:
        tokens = [token for token in word_tokenize(text)
                  if token not in stop_words and not token.isnumeric() and len(token) > 2]
    else:
        tokens = word_tokenize(text)
    if lemmatize:
        tokens_tags = pos_tag(tokens) # Create part-of-speech tags.
        # Overwrite the list with the lemmatized versions of tokens.
        tokens = [lemmatizer.lemmatize(token, get_wordnet_tag(tag)) for token, tag in tokens_tags]
    if stem:
        # Overwrite the list with the stemmed versions of tokens.
        tokens = [stemmer.stem(token) for token in tokens]
    return tokens

def read_dataset(dirpath):
    """
    Function that reads every regular file in the directory specified by dirpath, in text mode.
    Returns a list of (filename, text) tuples, ordered by filename,
    where text has newline characters and non-printable characters removed.
    Handles newline endings of '\\n' and '\\r\\n'.
    Subdirectories are ignored.
    """
    data = []
    # Sort the listing: os.listdir returns entries in arbitrary order,
    # which would make any seeded shuffle of the result non-reproducible.
    for file in sorted(listdir(dirpath)):
        path = join(dirpath, file)
        if not isfile(path):
            continue
        with open(path, 'rt', encoding = 'utf-8-sig', errors = 'ignore') as fd:
            text = fd.read().replace('\n', ' ').replace('\r', '')
        text = ''.join(filter(_PRINTABLE_CHARS.__contains__, text))
        data.append((file, text))
    return data

def clear_screen(current_system):
    """
    Function that clears the terminal screen.
    current_system is compared against 'Windows' to pick the command:
    'cls' on Windows, 'clear' otherwise (Linux/OS X).
    """
    system('cls' if current_system == 'Windows' else 'clear')
    return
104 | return 105 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Graph-of-docs Text Representation 2 | 3 | This repository hosts code for the papers: 4 | * [On a novel representation of multiple textual documents in a single graph (KES-IDT 2020)](https://link.springer.com/chapter/10.1007%2F978-981-15-5925-9_9) - [Download](https://github.com/NC0DER/GraphOfDocs/releases/tag/KES-IDT-2020) 5 | * [An innovative graph-based approach to advance feature selection from multiple textual documents (AIAI 2020)](https://link.springer.com/chapter/10.1007%2F978-3-030-49161-1_9) - [Download](https://github.com/NC0DER/GraphOfDocs/archive/master.zip) 6 | 7 | ![image1](https://github.com/NC0DER/GraphOfDocs/blob/master/GraphOfDocs/images/feature_selection.jpg) 8 | 9 | ## Datasets 10 | Available in [this link](https://github.com/imis-lab/aiai-2020-datasets) 11 | 12 | ## Test Results 13 | Edit `GraphOfDocs/config_experiments.py` to set up the experiments and run `experiments.py`. 14 | 15 | ## Installation 16 | **Prerequisites:** 17 | * `Windows 10` 64-bit / Debian based `Linux` 64-bit. 18 | * `Python 3` (min. version 3.6), `pip3` (& `py` launcher Windows-only). 19 | * Working `Neo4j` Database (min. version 3.5.12). 20 | 21 | ### Windows 10 22 | Download the project from the green button above, unzip it, 23 | and then open a cmd terminal to this folder and type `pip3 install -r requirements.txt`. 24 | This command will install the necessary `Python` libraries\* to run the project. 25 | 26 | ### Debian Based Linux 27 | We ran the following commands to update `Python`, `git`, 28 | clone the project to a local folder and install the necessary `Python` libraries\*.
29 | ```bash 30 | sudo apt install python3.6 31 | sudo apt install git-all 32 | git clone https://github.com/NC0DER/GraphOfDocs 33 | cd GraphOfDocs 34 | pip3 install -r requirements.txt 35 | ``` 36 | *\* Optionally you could create a virtual environment first,* 37 | *\* to isolate the libraries from your python user install.* 38 | *\* However the setup script doesn't downgrade existing libraries,* 39 | *\* so there's zero risk in affecting your local user install.* 40 | 41 | ## Database Setup (Windows / Linux) 42 | Create a new database from the `Neo4j` desktop app using 3.5.12 as the min. version. 43 | Update your memory settings to match the following values, 44 | and install the following extra plugins as depicted in the image. 45 | ![image2](https://github.com/NC0DER/GraphOfDocs/blob/master/GraphOfDocs/images/settings.jpg) 46 | *Hint: if you use a dedicated server that only runs `Neo4j`, you could increase these values, 47 | accordingly as specified in the comments of these parameters.* 48 | 49 | Run the `GraphOfDocs.py` script which will create thousands of nodes, 50 | and millions of relationships in the database. 51 | Once it's done, the database is initialized and ready for use. 52 | 53 | ## Running the app 54 | You could use the `Neo4j Browser` to run your queries, 55 | or for large queries you could use the custom visualization tool 56 | `visualize.html` which is located in the `GraphOfDocs` Subdirectory. 57 | 58 | ## Citation 59 | On a novel representation of multiple textual documents in a single graph (KES-IDT 2020) paper: 60 | ``` 61 | Giarelis N., Kanakaris N., Karacapilidis N. (2020) On a Novel Representation of Multiple Textual Documents in a Single Graph. In: Czarnowski I., Howlett R., Jain L. (eds) Intelligent Decision Technologies. IDT 2020. Smart Innovation, Systems and Technologies, vol 193. 
Springer, Singapore 62 | ``` 63 | 64 | ``` 65 | @InProceedings{10.1007/978-981-15-5925-9_9, 66 | author="Giarelis, Nikolaos 67 | and Kanakaris, Nikos 68 | and Karacapilidis, Nikos", 69 | editor="Czarnowski, Ireneusz 70 | and Howlett, Robert J. 71 | and Jain, Lakhmi C.", 72 | title="On a Novel Representation of Multiple Textual Documents in a Single Graph", 73 | booktitle="Intelligent Decision Technologies", 74 | year="2020", 75 | publisher="Springer Singapore", 76 | address="Singapore", 77 | pages="105--115", 78 | abstract="This paper introduces a novel approach to represent multiple documents as a single graph, namely, the graph-of-docs model, together with an associated novel algorithm for text categorization. The proposed approach enables the investigation of the importance of a term into a whole corpus of documents and supports the inclusion of relationship edges between documents, thus enabling the calculation of important metrics as far as documents are concerned. Compared to well-tried existing solutions, our initial experimentations demonstrate a significant improvement of the accuracy of the text categorization process. For the experimentations reported in this paper, we used a well-known dataset containing about 19,000 documents organized in various subjects.", 79 | isbn="978-981-15-5925-9" 80 | } 81 | ``` 82 | 83 | An innovative graph-based approach to advance feature selection from multiple textual documents (AIAI 2020) paper: 84 | ``` 85 | Giarelis N., Kanakaris N., Karacapilidis N. (2020) An Innovative Graph-Based Approach to Advance Feature Selection from Multiple Textual Documents. In: Maglogiannis I., Iliadis L., Pimenidis E. (eds) Artificial Intelligence Applications and Innovations. AIAI 2020. IFIP Advances in Information and Communication Technology, vol 583. 
Springer, Cham 86 | ``` 87 | 88 | ``` 89 | @InProceedings{10.1007/978-3-030-49161-1_9, 90 | author="Giarelis, Nikolaos 91 | and Kanakaris, Nikos 92 | and Karacapilidis, Nikos", 93 | editor="Maglogiannis, Ilias 94 | and Iliadis, Lazaros 95 | and Pimenidis, Elias", 96 | title="An Innovative Graph-Based Approach to Advance Feature Selection from Multiple Textual Documents", 97 | booktitle="Artificial Intelligence Applications and Innovations", 98 | year="2020", 99 | publisher="Springer International Publishing", 100 | address="Cham", 101 | pages="96--106", 102 | abstract="This paper introduces a novel graph-based approach to select features from multiple textual documents. The proposed solution enables the investigation of the importance of a term into a whole corpus of documents by utilizing contemporary graph theory methods, such as community detection algorithms and node centrality measures. Compared to well-tried existing solutions, evaluation results show that the proposed approach increases the accuracy of most text classifiers employed and decreases the number of features required to achieve `state-of-the-art' accuracy. Well-known datasets used for the experimentations reported in this paper include 20Newsgroups, LingSpam, Amazon Reviews and Reuters.", 103 | isbn="978-3-030-49161-1" 104 | } 105 | ``` 106 | 107 | ## Contributors 108 | * Nikolaos Giarelis (giarelis@ceid.upatras.gr) 109 | * Nikos Kanakaris (nkanakaris@upnet.gr) 110 | * Nikos Karacapilidis (karacap@upatras.gr) 111 | -------------------------------------------------------------------------------- /experiments.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.warn = lambda *args, **kwards: None # Supress warnings. 
import os
import sys
import timeit
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from neo4j import ServiceUnavailable
from prettytable import PrettyTable
from GraphOfDocs.neo4j_wrapper import Neo4jDatabase
from GraphOfDocs import utils
from GraphOfDocs import select
from GraphOfDocs import config_experiments
from GraphOfDocs import evaluation

# Experiment driver: compares bag-of-words, classic feature selection methods
# and the graph-of-docs feature selection on the dataset stored in Neo4j,
# then prints a results table, writes a csv and generates plots.

results_table = PrettyTable(
    ['Method', 'Accuracy', 'Number of features',
     'Train size', 'Test size', 'Details'])
evaluation_results = []
feature_selection_evaluation_results = []

start_time = timeit.default_timer()
print('')
print('%'*100)
print('!START OF THE EXPERIMENT!')
print(f'DATASET DIR PATH: {config_experiments.DATASET_PATH}')
print('MIN NUMBER OF DOCUMENTS PER SELECTED COMMUNITY: '
      f'{config_experiments.MIN_NUMBER_OF_DOCUMENTS_PER_SELECTED_COMMUNITY}')
print('VARIANCE THRESHOLD: '
      f'{config_experiments.VARIANCE_THRESHOLD}')
print('SELECT KBEST K: '
      f'{config_experiments.SELECT_KBEST_K}')
print('TOP N SELECTED COMMUNITY TERMS: '
      f'{config_experiments.TOP_N_SELECTED_COMMUNITY_TERMS}')

# Connect to database.
# The connection settings can be overridden through environment variables;
# the defaults are the values this script has always used.
try:
    database = Neo4jDatabase(
        os.environ.get('NEO4J_URI', 'bolt://localhost:7687'),
        os.environ.get('NEO4J_USER', 'neo4j'),
        os.environ.get('NEO4J_PASSWORD', '123'))
# Neo4j server is unavailable.
# This client app cannot open a connection.
except ServiceUnavailable:
    print('\t* Neo4j database is unavailable.')
    print('\t* Please check the database connection before running this app.')
    input('\t* Press any key to exit the app...')
    sys.exit(1)

# From here on the connection is always closed, even if an experiment step fails.
try:
    # Retrieve the communities of documents and their filenames.
    doc_communities = select.get_communities_filenames(database)
    # Keep only the communities with at least the configured minimum number of documents.
    filtered_doc_communities = \
        [doc_community for doc_community in doc_communities
         if doc_community[2] >=
         config_experiments.MIN_NUMBER_OF_DOCUMENTS_PER_SELECTED_COMMUNITY]
    # Fetch the selected documents (flatten the per-community document lists).
    selected_docs = [doc
                     for _, docs, _ in filtered_doc_communities
                     for doc in docs]
    # Map community id to documents.
    doc_communities_dict = {community_id: docs
                            for community_id, docs, number_of_docs
                            in filtered_doc_communities}
    # Map document to community id.
    doc_to_community_dict = {doc: community_id
                             for community_id, doc_community, _
                             in filtered_doc_communities for doc in doc_community}
    print(f'Number of selected documents: {len(selected_docs)}')
    # Read dataset, clean dataset and create a pandas dataframe of the dataset.
    dataset = utils.read_dataset(config_experiments.DATASET_PATH)
    # The class of each document is derived from its filename by
    # config_experiments.extract_file_class; compute it once per document.
    file_classes = [config_experiments.extract_file_class(identifier)
                    for identifier, _ in dataset]
    # Create a label encoder (map classes to integer numbers).
    le = LabelEncoder()
    le.fit(file_classes)
    # Encode all classes in a single call instead of one call per document.
    class_numbers = le.transform(file_classes)
    # Tuple: file identifier, file class, file class number, file text.
    clean_dataset = [(identifier,
                      file_class,
                      class_number,
                      ' '.join(utils.generate_words(
                          text,
                          extend_window = True,
                          insert_stopwords = False,
                          lemmatize = False, stem = False)))
                     for (identifier, text), file_class, class_number
                     in zip(dataset, file_classes, class_numbers)]
    df = pd.DataFrame(clean_dataset,
                      columns = ['identifier', 'class', 'class_number', 'text'])
    df_all = df

    # Keep only the selected documents (i.e. the documents of the filtered communities).
    df = df[df['identifier'].isin(selected_docs)]
    df = shuffle(df, random_state = 42)
    print('EXAMPLE OF THE PANDAS DATAFRAME')
    print(df.head(2))

    # Number of unique classes
    print(f'Number of unique classes: {le.classes_.shape}')

    X = df['text']
    y = df['class_number']
    positions = list(range(len(X)))
    positions_train, positions_test = train_test_split(
        positions, test_size = 0.33, random_state = 42)

    res = evaluation.BOWEvaluator()\
        .evaluate(X, y, results_table = results_table,
                  classifiers = config_experiments.classifiers)
    evaluation_results.extend(res)

    res = evaluation.MetaFeatureSelectionEvaluator()\
        .evaluate(X, y, results_table = results_table,
                  classifiers = config_experiments.classifiers)
    evaluation_results.extend(res)

    for variance_threshold in config_experiments.VARIANCE_THRESHOLD:
        res = evaluation\
            .LowVarianceFeatureSelectionEvaluator(
                variance_threshold = variance_threshold)\
            .evaluate(X, y, results_table = results_table,
                      classifiers = config_experiments.classifiers)
        evaluation_results.extend(res)
        feature_selection_evaluation_results.extend(res)

    for kbest_k in config_experiments.SELECT_KBEST_K:
        res = evaluation\
            .SelectKBestFeatureSelectionEvaluator(
                kbest = kbest_k)\
            .evaluate(X, y, results_table = results_table,
                      classifiers = config_experiments.classifiers)
        evaluation_results.extend(res)
        feature_selection_evaluation_results.extend(res)

    evaluation.GraphOfDocsClassifier(
        doc_to_community_dict, doc_communities_dict)\
        .calculate_accuracy(df['identifier'], results_table = results_table)

    for top_n in config_experiments.TOP_N_SELECTED_COMMUNITY_TERMS:
        res = evaluation\
            .TopNOfEachCommunityEvaluator(
                top_n, doc_to_community_dict,
                doc_communities_dict)\
            .evaluate(
                X, y, df = df,
                positions_train = positions_train,
                database = database,
                results_table = results_table,
                classifiers = config_experiments.classifiers)
        evaluation_results.extend(res)
        feature_selection_evaluation_results.extend(res)

    df_evaluation_results = pd.DataFrame(evaluation_results)
    df_feature_selection_evaluation_results = \
        pd.DataFrame(feature_selection_evaluation_results)
    print('EXAMPLE OF THE EVALUATION RESULTS PANDAS DATAFRAME')
    print(df_evaluation_results.head(2))

    results_table.sortby = 'Accuracy'
    results_table.reversesort = True
    print(results_table)

    # NOTE(review): the attribute name below is copied verbatim from the original script;
    # it appears to contain a non-ASCII look-alike letter in 'OUΤPUT' - confirm against config_experiments.
    output_dir = config_experiments.EXPERIMENTAL_RESULTS_OUΤPUT_DIR
    plots_prefix = config_experiments.PLOTS_PREFIX
    df_evaluation_results.to_csv(f'{output_dir}/{plots_prefix}_evaluation_results.csv')
    evaluation.generate_plots(
        df_feature_selection_evaluation_results,
        output_dir = output_dir,
        plots_prefix = f'{plots_prefix}_feature_selection',
        show_only = False)
finally:
    database.close()

stop_time = timeit.default_timer()
print(f'Execution time: {stop_time - start_time}')
print('!END OF THE EXPERIMENT!')
print('%'*100)
print('')
df = df, 141 | positions_train = positions_train, 142 | database = database, 143 | results_table = results_table, 144 | classifiers = config_experiments.classifiers) 145 | evaluation_results.extend(res) 146 | feature_selection_evaluation_results.extend(res) 147 | 148 | df_evaluation_results = pd.DataFrame(evaluation_results) 149 | df_feature_selection_evaluation_results = \ 150 | pd.DataFrame(feature_selection_evaluation_results) 151 | print('EXAMPLE OF THE EVALUATION RESULTS PANDAS DATAFRAME') 152 | print(df_evaluation_results.head(2)) 153 | 154 | results_table.sortby = 'Accuracy' 155 | results_table.reversesort = True 156 | print(results_table) 157 | 158 | output_dir = config_experiments.EXPERIMENTAL_RESULTS_OUΤPUT_DIR 159 | plots_prefix = config_experiments.PLOTS_PREFIX 160 | df_evaluation_results.to_csv(f'{output_dir}/{plots_prefix}_evaluation_results.csv') 161 | evaluation.generate_plots( 162 | df_feature_selection_evaluation_results, 163 | output_dir = output_dir, 164 | plots_prefix = f'{plots_prefix}_feature_selection', 165 | show_only = False) 166 | 167 | database.close() 168 | stop_time = timeit.default_timer() 169 | print(f'Execution time: {stop_time - start_time}') 170 | print('!END OF THE EXPERIMENT!') 171 | print('%'*100) 172 | print('') 173 | -------------------------------------------------------------------------------- /GraphOfDocs/create.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script contains functions that 3 | create data in the Neo4j database. 4 | """ 5 | import platform 6 | from GraphOfDocs.utils import clear_screen 7 | from GraphOfDocs.algos import * 8 | from GraphOfDocs.select import get_communities_filenames, get_communities_tags 9 | 10 | # Initialize an empty set of edges. 11 | edges = {} 12 | # Initialize an empty list of unique terms. 13 | # We are using a list to preserver order of appearance. 
# Global list of the unique terms already created as Word nodes, in order of appearance.
nodes = []

def create_graph_of_words(words, database, filename, window_size = 4):
    """
    Function that creates a Graph of Words that contains all nodes from each document for easy comparison,
    inside the neo4j database, using the appropriate cypher queries.

    words: the tokens of the document, in order of appearance;
           the special token 'e5c' marks the end of a sentence.
    database: object whose execute(query, mode) method runs a cypher query.
    filename: the name of the document, stored on its Document node.
    window_size: the size of the sliding co-occurrence window.

    Returns the filename if the document was skipped (fewer words than window_size),
    otherwise returns None.
    """
    # Files that have word length < window size, are skipped.
    length = len(words)
    if length < window_size:
        # Early exit, we return the skipped filename
        return filename

    # We are getting the unique terms for the current graph of words, in order of appearance.
    # The end-of-sentence token is removed, so it doesn't get created as a node.
    terms = [term for term in dict.fromkeys(words) if term != 'e5c']

    # If the word doesn't exist as a node, then create it.
    # The global list of nodes is shared between all graphs of words, to avoid duplicate nodes.
    # A set snapshot keeps each membership test constant-time.
    known_nodes = set(nodes)
    for word in terms:
        if word not in known_nodes:
            database.execute(f'CREATE (w:Word {{key: "{word}"}})', 'w')
            # Append word to the global node list, to avoid duplicate creation.
            nodes.append(word)
            known_nodes.add(word)

    # Create unique connections between existing nodes of the graph.
    for i, current in enumerate(words):
        # If there are leftover items smaller than the window size, reduce it.
        span = min(window_size, length - i)
        # If the current word is the end of sentence string,
        # we need to skip it, in order to go to the words of the next sentence,
        # without connecting words of different sentences, in the database.
        if current == 'e5c':
            continue
        # Connect the current element with the next elements of the window size.
        for j in range(1, span):
            following = words[i + j]
            # Reached the end of sentence string.
            # We can't connect words of different sentences,
            # therefore we need to pick a new current word,
            # by going back out to the outer loop.
            if following == 'e5c':
                break
            # The relationship is matched without direction in the queries below,
            # so (a, b) and (b, a) must share one weight counter.
            # The global dictionary of edges merges co-occurrences between different graphs of words.
            edge = tuple(sorted((current, following)))
            if edge in edges:
                # If the edge exists, just update its weight.
                edges[edge] = edges[edge] + 1
                query = (f'MATCH (w1:Word {{key: "{current}"}})-[r:connects]-(w2:Word {{key: "{following}"}}) '
                         f'SET r.weight = {edges[edge]}')
            else:
                # Else, create it, with a starting weight of 1 meaning first co-occurrence.
                edges[edge] = 1
                query = (f'MATCH (w1:Word {{key: "{current}"}}) '
                         f'MATCH (w2:Word {{key: "{following}"}}) '
                         f'MERGE (w1)-[r:connects {{weight: {edges[edge]}}}]-(w2)')
            database.execute(query, 'w')

    # Create a parent node that represents the document itself.
    # This node is connected to all words of its own graph,
    # and will be used for similarity/comparison queries.
    # NOTE(review): filename is interpolated as-is; a double quote in it would break the query.
    database.execute(f'CREATE (d:Document {{filename: "{filename}"}})', 'w')
    # The python list of terms is interpolated as a cypher list of quoted strings.
    query = (f'MATCH (w:Word) WHERE w.key IN {terms} '
              'WITH collect(w) as words '
             f'MATCH (d:Document {{filename: "{filename}"}}) '
              'UNWIND words as word '
              'CREATE (d)-[:includes]->(word)')
    database.execute(query, 'w')
    return

def run_initial_algorithms(database):
    """
    Function that runs centrality & community detection algorithms,
    in order to prepare the data for analysis and visualization.
    Pagerank & Louvain are used, respectively, on the Word nodes
    and their 'connects' relationships.
    The results are written to the 'pagerank' and 'community' properties.
    """
    # Append the parameter 'weight' for the weighted version of the algorithm.
    pagerank(database, 'Word', 'connects', 20, 'pagerank')
    louvain(database, 'Word', 'connects', 'community')
    return

def create_similarity_graph(database):
    """
    Function that creates a similarity graph
    based on Jaccard similarity measure.
    This measure connects the document nodes with each other
    using the relationship 'is_similar',
    which has the similarity score as a property.
    In order to prepare the data for analysis and visualization,
    we use Louvain Community detection algorithm.
    The calculated community id for each node is being stored
    on the nodes themselves.
    """
    # Remove similarity edges from previous iterations.
    database.execute('MATCH ()-[r:is_similar]->() DELETE r', 'w')

    # Create the similarity graph using Jaccard similarity measure.
    jaccard(database, 'Document', 'includes', 'Word', 0.23, 'is_similar', 'score')

    # Find all similar document communities.
    # Append the parameter 'score' for the weighted version of the algorithm.
    louvain(database, 'Document', 'is_similar', 'community')
    print('Similarity graph created.')
    return

def create_clustering_tags(database, top_terms = 25):
    """
    This function creates, in the Neo4j database,
    for all communities, the relationships that connect
    document nodes of a similarity community with top important
    clustering tags for that community, based on the amount of common
    appearances between documents and a higher pagerank score.
    A community without an entry in the tags dictionary is reported and skipped.
    """
    current_system = platform.system()
    # Remove has_tag edges from previous iterations.
    database.execute('MATCH ()-[r:has_tag]->() DELETE r', 'w')

    # Get all id numbers from communities and all their associated file(name)s.
    print('Loading all community ids and their filenames...')
    results = get_communities_filenames(database)

    # The communities are ordered by filecount, which means that after the first one found,
    # with 1 file all the rest have the same amount of documents.
    # There is no reason to create tags in isolated communities, since there are no common tags,
    # with other documents. Therefore we are going to filter them out of the results list.
    first_isolated = next(
        (index for index, result in enumerate(results) if result[2] == 1),
        len(results))
    results = results[:first_isolated]
    # Count all results (rows) for a simple loading screen.
    total_count = len(results)

    # Get all top tags for each community.
    top_tags = get_communities_tags(database, top_terms)

    for count, (community, filenames, _) in enumerate(results, start = 1):
        # Print the number of the currently processed community.
        print(f'Processing {count} out of {total_count} communities...' )
        try:
            tags = top_tags[community]
        except KeyError:
            # Without tags there is nothing to connect; running the query
            # would reuse the tags of the previous community, or fail.
            print('\t* Error: Community key should exist in dictionary!')
        else:
            # Connect filenames of a specific community with all their associated tags.
            # Tags are considered to be important words that describe that community,
            # and which already exist in the graphofdocs model.
            query = (f'UNWIND {filenames} AS filename '
                      'MATCH (d:Document {filename: filename}) '
                     f'UNWIND {tags} AS tag '
                      'MATCH (w:Word {key: tag}) '
                      'CREATE (d)-[r:has_tag]->(w)')
            database.execute(query, 'w')

        # Clear the screen to output the updated progress counter.
        clear_screen(current_system)
    return
194 | count = count + 1 195 | # Clear the screen to output the update the progress counter. 196 | clear_screen(current_system) 197 | return 198 | -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/amazon_categories/AMAZON_evaluation_results.csv: -------------------------------------------------------------------------------- 1 | ,Method,Accuracy,Number of features,Train size,Test size,Classifier,variance thershold,kbest,top_n 2 | 0,BOW+NB,0.9838,9771,623,308,NB,,, 3 | 1,BOW+LR,0.9838,9771,623,308,LR,,, 4 | 2,BOW+5NN,0.6039,9771,623,308,5NN,,, 5 | 3,BOW+2NN,0.7532,9771,623,308,2NN,,, 6 | 4,BOW+1KNN,0.75,9771,623,308,1KNN,,, 7 | 5,BOW+LSVM,0.9935,9771,623,308,LSVM,,, 8 | 6,BOW+NN100x50,0.9773,9771,623,308,NN100x50,,, 9 | 7,META+NB,0.987,2935,623,308,NB,,, 10 | 8,META+LR,0.9838,2935,623,308,LR,,, 11 | 9,META+5NN,0.6656,2935,623,308,5NN,,, 12 | 10,META+2NN,0.8344,2935,623,308,2NN,,, 13 | 11,META+1KNN,0.8279,2935,623,308,1KNN,,, 14 | 12,META+LSVM,0.9935,2935,623,308,LSVM,,, 15 | 13,META+NN100x50,0.987,2935,623,308,NN100x50,,, 16 | 14,LVAR+NB,0.9838,9771,623,308,NB,0.0005,, 17 | 15,LVAR+LR,0.9838,9771,623,308,LR,0.0005,, 18 | 16,LVAR+5NN,0.6039,9771,623,308,5NN,0.0005,, 19 | 17,LVAR+2NN,0.7532,9771,623,308,2NN,0.0005,, 20 | 18,LVAR+1KNN,0.75,9771,623,308,1KNN,0.0005,, 21 | 19,LVAR+LSVM,0.9935,9771,623,308,LSVM,0.0005,, 22 | 20,LVAR+NN100x50,0.9773,9771,623,308,NN100x50,0.0005,, 23 | 21,LVAR+NB,0.9838,9771,623,308,NB,0.001,, 24 | 22,LVAR+LR,0.9838,9771,623,308,LR,0.001,, 25 | 23,LVAR+5NN,0.6039,9771,623,308,5NN,0.001,, 26 | 24,LVAR+2NN,0.7532,9771,623,308,2NN,0.001,, 27 | 25,LVAR+1KNN,0.75,9771,623,308,1KNN,0.001,, 28 | 26,LVAR+LSVM,0.9935,9771,623,308,LSVM,0.001,, 29 | 27,LVAR+NN100x50,0.9773,9771,623,308,NN100x50,0.001,, 30 | 28,LVAR+NB,0.9838,9771,623,308,NB,0.0015,, 31 | 29,LVAR+LR,0.9838,9771,623,308,LR,0.0015,, 32 | 30,LVAR+5NN,0.6039,9771,623,308,5NN,0.0015,, 33 | 
31,LVAR+2NN,0.7532,9771,623,308,2NN,0.0015,, 34 | 32,LVAR+1KNN,0.75,9771,623,308,1KNN,0.0015,, 35 | 33,LVAR+LSVM,0.9935,9771,623,308,LSVM,0.0015,, 36 | 34,LVAR+NN100x50,0.9773,9771,623,308,NN100x50,0.0015,, 37 | 35,LVAR+NB,0.9838,6000,623,308,NB,0.002,, 38 | 36,LVAR+LR,0.9838,6000,623,308,LR,0.002,, 39 | 37,LVAR+5NN,0.5942,6000,623,308,5NN,0.002,, 40 | 38,LVAR+2NN,0.7305,6000,623,308,2NN,0.002,, 41 | 39,LVAR+1KNN,0.7273,6000,623,308,1KNN,0.002,, 42 | 40,LVAR+LSVM,1.0,6000,623,308,LSVM,0.002,, 43 | 41,LVAR+NN100x50,0.9838,6000,623,308,NN100x50,0.002,, 44 | 42,LVAR+NB,0.9838,6000,623,308,NB,0.003,, 45 | 43,LVAR+LR,0.9838,6000,623,308,LR,0.003,, 46 | 44,LVAR+5NN,0.5942,6000,623,308,5NN,0.003,, 47 | 45,LVAR+2NN,0.7305,6000,623,308,2NN,0.003,, 48 | 46,LVAR+1KNN,0.7273,6000,623,308,1KNN,0.003,, 49 | 47,LVAR+LSVM,1.0,6000,623,308,LSVM,0.003,, 50 | 48,LVAR+NN100x50,0.9838,6000,623,308,NN100x50,0.003,, 51 | 49,LVAR+NB,0.9935,3637,623,308,NB,0.004,, 52 | 50,LVAR+LR,0.9935,3637,623,308,LR,0.004,, 53 | 51,LVAR+5NN,0.6331,3637,623,308,5NN,0.004,, 54 | 52,LVAR+2NN,0.776,3637,623,308,2NN,0.004,, 55 | 53,LVAR+1KNN,0.7727,3637,623,308,1KNN,0.004,, 56 | 54,LVAR+LSVM,1.0,3637,623,308,LSVM,0.004,, 57 | 55,LVAR+NN100x50,0.9805,3637,623,308,NN100x50,0.004,, 58 | 56,LVAR+NB,0.9903,2906,623,308,NB,0.005,, 59 | 57,LVAR+LR,0.9935,2906,623,308,LR,0.005,, 60 | 58,LVAR+5NN,0.6883,2906,623,308,5NN,0.005,, 61 | 59,LVAR+2NN,0.7955,2906,623,308,2NN,0.005,, 62 | 60,LVAR+1KNN,0.7825,2906,623,308,1KNN,0.005,, 63 | 61,LVAR+LSVM,1.0,2906,623,308,LSVM,0.005,, 64 | 62,LVAR+NN100x50,0.9805,2906,623,308,NN100x50,0.005,, 65 | 63,LVAR+NB,0.9935,1719,623,308,NB,0.01,, 66 | 64,LVAR+LR,0.9935,1719,623,308,LR,0.01,, 67 | 65,LVAR+5NN,0.6916,1719,623,308,5NN,0.01,, 68 | 66,LVAR+2NN,0.8084,1719,623,308,2NN,0.01,, 69 | 67,LVAR+1KNN,0.7597,1719,623,308,1KNN,0.01,, 70 | 68,LVAR+LSVM,1.0,1719,623,308,LSVM,0.01,, 71 | 69,LVAR+NN100x50,0.9838,1719,623,308,NN100x50,0.01,, 72 | 70,KBEST+NB,1.0,350,623,308,NB,,350.0, 73 | 
71,KBEST+LR,0.9935,350,623,308,LR,,350.0, 74 | 72,KBEST+5NN,0.9513,350,623,308,5NN,,350.0, 75 | 73,KBEST+2NN,0.9838,350,623,308,2NN,,350.0, 76 | 74,KBEST+1KNN,0.9773,350,623,308,1KNN,,350.0, 77 | 75,KBEST+LSVM,1.0,350,623,308,LSVM,,350.0, 78 | 76,KBEST+NN100x50,1.0,350,623,308,NN100x50,,350.0, 79 | 77,KBEST+NB,1.0,500,623,308,NB,,500.0, 80 | 78,KBEST+LR,0.9935,500,623,308,LR,,500.0, 81 | 79,KBEST+5NN,0.8864,500,623,308,5NN,,500.0, 82 | 80,KBEST+2NN,0.9578,500,623,308,2NN,,500.0, 83 | 81,KBEST+1KNN,0.9221,500,623,308,1KNN,,500.0, 84 | 82,KBEST+LSVM,1.0,500,623,308,LSVM,,500.0, 85 | 83,KBEST+NN100x50,0.9935,500,623,308,NN100x50,,500.0, 86 | 84,KBEST+NB,0.9968,1000,623,308,NB,,1000.0, 87 | 85,KBEST+LR,0.9903,1000,623,308,LR,,1000.0, 88 | 86,KBEST+5NN,0.8052,1000,623,308,5NN,,1000.0, 89 | 87,KBEST+2NN,0.9091,1000,623,308,2NN,,1000.0, 90 | 88,KBEST+1KNN,0.8994,1000,623,308,1KNN,,1000.0, 91 | 89,KBEST+LSVM,1.0,1000,623,308,LSVM,,1000.0, 92 | 90,KBEST+NN100x50,0.9805,1000,623,308,NN100x50,,1000.0, 93 | 91,KBEST+NB,0.9838,2000,623,308,NB,,2000.0, 94 | 92,KBEST+LR,0.9838,2000,623,308,LR,,2000.0, 95 | 93,KBEST+5NN,0.7078,2000,623,308,5NN,,2000.0, 96 | 94,KBEST+2NN,0.8247,2000,623,308,2NN,,2000.0, 97 | 95,KBEST+1KNN,0.8019,2000,623,308,1KNN,,2000.0, 98 | 96,KBEST+LSVM,0.9935,2000,623,308,LSVM,,2000.0, 99 | 97,KBEST+NN100x50,0.974,2000,623,308,NN100x50,,2000.0, 100 | 98,KBEST+NB,0.987,3000,623,308,NB,,3000.0, 101 | 99,KBEST+LR,0.9838,3000,623,308,LR,,3000.0, 102 | 100,KBEST+5NN,0.6558,3000,623,308,5NN,,3000.0, 103 | 101,KBEST+2NN,0.7695,3000,623,308,2NN,,3000.0, 104 | 102,KBEST+1KNN,0.7565,3000,623,308,1KNN,,3000.0, 105 | 103,KBEST+LSVM,0.9935,3000,623,308,LSVM,,3000.0, 106 | 104,KBEST+NN100x50,0.9773,3000,623,308,NN100x50,,3000.0, 107 | 105,KBEST+NB,0.987,4000,623,308,NB,,4000.0, 108 | 106,KBEST+LR,0.9838,4000,623,308,LR,,4000.0, 109 | 107,KBEST+5NN,0.5844,4000,623,308,5NN,,4000.0, 110 | 108,KBEST+2NN,0.7435,4000,623,308,2NN,,4000.0, 111 | 
109,KBEST+1KNN,0.7403,4000,623,308,1KNN,,4000.0, 112 | 110,KBEST+LSVM,0.9935,4000,623,308,LSVM,,4000.0, 113 | 111,KBEST+NN100x50,0.974,4000,623,308,NN100x50,,4000.0, 114 | 112,KBEST+NB,0.987,5000,623,308,NB,,5000.0, 115 | 113,KBEST+LR,0.9838,5000,623,308,LR,,5000.0, 116 | 114,KBEST+5NN,0.5844,5000,623,308,5NN,,5000.0, 117 | 115,KBEST+2NN,0.7305,5000,623,308,2NN,,5000.0, 118 | 116,KBEST+1KNN,0.7273,5000,623,308,1KNN,,5000.0, 119 | 117,KBEST+LSVM,0.9935,5000,623,308,LSVM,,5000.0, 120 | 118,KBEST+NN100x50,0.9773,5000,623,308,NN100x50,,5000.0, 121 | 119,KBEST+NB,0.987,6000,623,308,NB,,6000.0, 122 | 120,KBEST+LR,0.9838,6000,623,308,LR,,6000.0, 123 | 121,KBEST+5NN,0.5682,6000,623,308,5NN,,6000.0, 124 | 122,KBEST+2NN,0.7468,6000,623,308,2NN,,6000.0, 125 | 123,KBEST+1KNN,0.7305,6000,623,308,1KNN,,6000.0, 126 | 124,KBEST+LSVM,0.9935,6000,623,308,LSVM,,6000.0, 127 | 125,KBEST+NN100x50,0.974,6000,623,308,NN100x50,,6000.0, 128 | 126,KBEST+NB,0.987,7000,623,308,NB,,7000.0, 129 | 127,KBEST+LR,0.9838,7000,623,308,LR,,7000.0, 130 | 128,KBEST+5NN,0.5325,7000,623,308,5NN,,7000.0, 131 | 129,KBEST+2NN,0.7208,7000,623,308,2NN,,7000.0, 132 | 130,KBEST+1KNN,0.711,7000,623,308,1KNN,,7000.0, 133 | 131,KBEST+LSVM,0.9935,7000,623,308,LSVM,,7000.0, 134 | 132,KBEST+NN100x50,0.9773,7000,623,308,NN100x50,,7000.0, 135 | 133,TOPN+NB,0.9903,372,623,308,NB,,,5.0 136 | 134,TOPN+LR,0.9903,372,623,308,LR,,,5.0 137 | 135,TOPN+5NN,0.9805,372,623,308,5NN,,,5.0 138 | 136,TOPN+2NN,0.9805,372,623,308,2NN,,,5.0 139 | 137,TOPN+1KNN,0.9708,372,623,308,1KNN,,,5.0 140 | 138,TOPN+LSVM,0.9935,372,623,308,LSVM,,,5.0 141 | 139,TOPN+NN100x50,0.9935,372,623,308,NN100x50,,,5.0 142 | 140,TOPN+NB,0.9935,1065,623,308,NB,,,10.0 143 | 141,TOPN+LR,0.9935,1065,623,308,LR,,,10.0 144 | 142,TOPN+5NN,0.9318,1065,623,308,5NN,,,10.0 145 | 143,TOPN+2NN,0.9513,1065,623,308,2NN,,,10.0 146 | 144,TOPN+1KNN,0.9448,1065,623,308,1KNN,,,10.0 147 | 145,TOPN+LSVM,1.0,1065,623,308,LSVM,,,10.0 148 | 
146,TOPN+NN100x50,0.9935,1065,623,308,NN100x50,,,10.0 149 | 147,TOPN+NB,0.9935,1557,623,308,NB,,,15.0 150 | 148,TOPN+LR,0.9935,1557,623,308,LR,,,15.0 151 | 149,TOPN+5NN,0.7955,1557,623,308,5NN,,,15.0 152 | 150,TOPN+2NN,0.8701,1557,623,308,2NN,,,15.0 153 | 151,TOPN+1KNN,0.8474,1557,623,308,1KNN,,,15.0 154 | 152,TOPN+LSVM,1.0,1557,623,308,LSVM,,,15.0 155 | 153,TOPN+NN100x50,0.9935,1557,623,308,NN100x50,,,15.0 156 | 154,TOPN+NB,0.9935,1940,623,308,NB,,,20.0 157 | 155,TOPN+LR,0.9935,1940,623,308,LR,,,20.0 158 | 156,TOPN+5NN,0.7175,1940,623,308,5NN,,,20.0 159 | 157,TOPN+2NN,0.8182,1940,623,308,2NN,,,20.0 160 | 158,TOPN+1KNN,0.8149,1940,623,308,1KNN,,,20.0 161 | 159,TOPN+LSVM,1.0,1940,623,308,LSVM,,,20.0 162 | 160,TOPN+NN100x50,0.9968,1940,623,308,NN100x50,,,20.0 163 | 161,TOPN+NB,0.987,2372,623,308,NB,,,25.0 164 | 162,TOPN+LR,0.9935,2372,623,308,LR,,,25.0 165 | 163,TOPN+5NN,0.6656,2372,623,308,5NN,,,25.0 166 | 164,TOPN+2NN,0.7922,2372,623,308,2NN,,,25.0 167 | 165,TOPN+1KNN,0.7922,2372,623,308,1KNN,,,25.0 168 | 166,TOPN+LSVM,1.0,2372,623,308,LSVM,,,25.0 169 | 167,TOPN+NN100x50,0.9903,2372,623,308,NN100x50,,,25.0 170 | 168,TOPN+NB,0.9838,4897,623,308,NB,,,50.0 171 | 169,TOPN+LR,0.9838,4897,623,308,LR,,,50.0 172 | 170,TOPN+5NN,0.6201,4897,623,308,5NN,,,50.0 173 | 171,TOPN+2NN,0.7825,4897,623,308,2NN,,,50.0 174 | 172,TOPN+1KNN,0.776,4897,623,308,1KNN,,,50.0 175 | 173,TOPN+LSVM,1.0,4897,623,308,LSVM,,,50.0 176 | 174,TOPN+NN100x50,0.987,4897,623,308,NN100x50,,,50.0 177 | 175,TOPN+NB,0.9838,5777,623,308,NB,,,100.0 178 | 176,TOPN+LR,0.9838,5777,623,308,LR,,,100.0 179 | 177,TOPN+5NN,0.6201,5777,623,308,5NN,,,100.0 180 | 178,TOPN+2NN,0.7727,5777,623,308,2NN,,,100.0 181 | 179,TOPN+1KNN,0.7662,5777,623,308,1KNN,,,100.0 182 | 180,TOPN+LSVM,0.9968,5777,623,308,LSVM,,,100.0 183 | 181,TOPN+NN100x50,0.987,5777,623,308,NN100x50,,,100.0 184 | 182,TOPN+NB,0.9838,6692,623,308,NB,,,250.0 185 | 183,TOPN+LR,0.9838,6692,623,308,LR,,,250.0 186 | 184,TOPN+5NN,0.6136,6692,623,308,5NN,,,250.0 187 | 
185,TOPN+2NN,0.763,6692,623,308,2NN,,,250.0 188 | 186,TOPN+1KNN,0.7597,6692,623,308,1KNN,,,250.0 189 | 187,TOPN+LSVM,0.9968,6692,623,308,LSVM,,,250.0 190 | 188,TOPN+NN100x50,0.9773,6692,623,308,NN100x50,,,250.0 191 | 189,TOPN+NB,0.9838,7005,623,308,NB,,,500.0 192 | 190,TOPN+LR,0.9838,7005,623,308,LR,,,500.0 193 | 191,TOPN+5NN,0.6169,7005,623,308,5NN,,,500.0 194 | 192,TOPN+2NN,0.763,7005,623,308,2NN,,,500.0 195 | 193,TOPN+1KNN,0.7597,7005,623,308,1KNN,,,500.0 196 | 194,TOPN+LSVM,0.9968,7005,623,308,LSVM,,,500.0 197 | 195,TOPN+NN100x50,0.9773,7005,623,308,NN100x50,,,500.0 198 | -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/amazon_sentiment/AMAZON_evaluation_results.csv: -------------------------------------------------------------------------------- 1 | ,Method,Accuracy,Number of features,Train size,Test size,Classifier,variance thershold,kbest,top_n 2 | 0,BOW+NB,0.7208,9771,623,308,NB,,, 3 | 1,BOW+LR,0.763,9771,623,308,LR,,, 4 | 2,BOW+5NN,0.6169,9771,623,308,5NN,,, 5 | 3,BOW+2NN,0.6526,9771,623,308,2NN,,, 6 | 4,BOW+1KNN,0.6429,9771,623,308,1KNN,,, 7 | 5,BOW+LSVM,0.763,9771,623,308,LSVM,,, 8 | 6,BOW+NN100x50,0.7273,9771,623,308,NN100x50,,, 9 | 7,META+NB,0.737,2731,623,308,NB,,, 10 | 8,META+LR,0.789,2731,623,308,LR,,, 11 | 9,META+5NN,0.6623,2731,623,308,5NN,,, 12 | 10,META+2NN,0.6623,2731,623,308,2NN,,, 13 | 11,META+1KNN,0.6558,2731,623,308,1KNN,,, 14 | 12,META+LSVM,0.7727,2731,623,308,LSVM,,, 15 | 13,META+NN100x50,0.75,2731,623,308,NN100x50,,, 16 | 14,LVAR+NB,0.7208,9771,623,308,NB,0.0005,, 17 | 15,LVAR+LR,0.763,9771,623,308,LR,0.0005,, 18 | 16,LVAR+5NN,0.6169,9771,623,308,5NN,0.0005,, 19 | 17,LVAR+2NN,0.6526,9771,623,308,2NN,0.0005,, 20 | 18,LVAR+1KNN,0.6429,9771,623,308,1KNN,0.0005,, 21 | 19,LVAR+LSVM,0.763,9771,623,308,LSVM,0.0005,, 22 | 20,LVAR+NN100x50,0.7273,9771,623,308,NN100x50,0.0005,, 23 | 21,LVAR+NB,0.7208,9771,623,308,NB,0.001,, 24 | 22,LVAR+LR,0.763,9771,623,308,LR,0.001,, 25 | 
23,LVAR+5NN,0.6169,9771,623,308,5NN,0.001,, 26 | 24,LVAR+2NN,0.6526,9771,623,308,2NN,0.001,, 27 | 25,LVAR+1KNN,0.6429,9771,623,308,1KNN,0.001,, 28 | 26,LVAR+LSVM,0.763,9771,623,308,LSVM,0.001,, 29 | 27,LVAR+NN100x50,0.7273,9771,623,308,NN100x50,0.001,, 30 | 28,LVAR+NB,0.7208,9771,623,308,NB,0.0015,, 31 | 29,LVAR+LR,0.763,9771,623,308,LR,0.0015,, 32 | 30,LVAR+5NN,0.6169,9771,623,308,5NN,0.0015,, 33 | 31,LVAR+2NN,0.6526,9771,623,308,2NN,0.0015,, 34 | 32,LVAR+1KNN,0.6429,9771,623,308,1KNN,0.0015,, 35 | 33,LVAR+LSVM,0.763,9771,623,308,LSVM,0.0015,, 36 | 34,LVAR+NN100x50,0.7273,9771,623,308,NN100x50,0.0015,, 37 | 35,LVAR+NB,0.724,6000,623,308,NB,0.002,, 38 | 36,LVAR+LR,0.7695,6000,623,308,LR,0.002,, 39 | 37,LVAR+5NN,0.6721,6000,623,308,5NN,0.002,, 40 | 38,LVAR+2NN,0.6688,6000,623,308,2NN,0.002,, 41 | 39,LVAR+1KNN,0.6688,6000,623,308,1KNN,0.002,, 42 | 40,LVAR+LSVM,0.7662,6000,623,308,LSVM,0.002,, 43 | 41,LVAR+NN100x50,0.7435,6000,623,308,NN100x50,0.002,, 44 | 42,LVAR+NB,0.724,6000,623,308,NB,0.003,, 45 | 43,LVAR+LR,0.7695,6000,623,308,LR,0.003,, 46 | 44,LVAR+5NN,0.6721,6000,623,308,5NN,0.003,, 47 | 45,LVAR+2NN,0.6688,6000,623,308,2NN,0.003,, 48 | 46,LVAR+1KNN,0.6688,6000,623,308,1KNN,0.003,, 49 | 47,LVAR+LSVM,0.7662,6000,623,308,LSVM,0.003,, 50 | 48,LVAR+NN100x50,0.7435,6000,623,308,NN100x50,0.003,, 51 | 49,LVAR+NB,0.7338,3637,623,308,NB,0.004,, 52 | 50,LVAR+LR,0.7792,3637,623,308,LR,0.004,, 53 | 51,LVAR+5NN,0.6331,3637,623,308,5NN,0.004,, 54 | 52,LVAR+2NN,0.6721,3637,623,308,2NN,0.004,, 55 | 53,LVAR+1KNN,0.6818,3637,623,308,1KNN,0.004,, 56 | 54,LVAR+LSVM,0.7695,3637,623,308,LSVM,0.004,, 57 | 55,LVAR+NN100x50,0.7468,3637,623,308,NN100x50,0.004,, 58 | 56,LVAR+NB,0.737,2906,623,308,NB,0.005,, 59 | 57,LVAR+LR,0.7727,2906,623,308,LR,0.005,, 60 | 58,LVAR+5NN,0.6656,2906,623,308,5NN,0.005,, 61 | 59,LVAR+2NN,0.6818,2906,623,308,2NN,0.005,, 62 | 60,LVAR+1KNN,0.6883,2906,623,308,1KNN,0.005,, 63 | 61,LVAR+LSVM,0.763,2906,623,308,LSVM,0.005,, 64 | 
62,LVAR+NN100x50,0.7403,2906,623,308,NN100x50,0.005,, 65 | 63,LVAR+NB,0.7338,1719,623,308,NB,0.01,, 66 | 64,LVAR+LR,0.7727,1719,623,308,LR,0.01,, 67 | 65,LVAR+5NN,0.7013,1719,623,308,5NN,0.01,, 68 | 66,LVAR+2NN,0.6818,1719,623,308,2NN,0.01,, 69 | 67,LVAR+1KNN,0.6916,1719,623,308,1KNN,0.01,, 70 | 68,LVAR+LSVM,0.7825,1719,623,308,LSVM,0.01,, 71 | 69,LVAR+NN100x50,0.75,1719,623,308,NN100x50,0.01,, 72 | 70,KBEST+NB,0.7403,350,623,308,NB,,350.0, 73 | 71,KBEST+LR,0.724,350,623,308,LR,,350.0, 74 | 72,KBEST+5NN,0.724,350,623,308,5NN,,350.0, 75 | 73,KBEST+2NN,0.6851,350,623,308,2NN,,350.0, 76 | 74,KBEST+1KNN,0.6786,350,623,308,1KNN,,350.0, 77 | 75,KBEST+LSVM,0.711,350,623,308,LSVM,,350.0, 78 | 76,KBEST+NN100x50,0.7338,350,623,308,NN100x50,,350.0, 79 | 77,KBEST+NB,0.75,500,623,308,NB,,500.0, 80 | 78,KBEST+LR,0.724,500,623,308,LR,,500.0, 81 | 79,KBEST+5NN,0.711,500,623,308,5NN,,500.0, 82 | 80,KBEST+2NN,0.6883,500,623,308,2NN,,500.0, 83 | 81,KBEST+1KNN,0.6916,500,623,308,1KNN,,500.0, 84 | 82,KBEST+LSVM,0.7338,500,623,308,LSVM,,500.0, 85 | 83,KBEST+NN100x50,0.7468,500,623,308,NN100x50,,500.0, 86 | 84,KBEST+NB,0.7305,1000,623,308,NB,,1000.0, 87 | 85,KBEST+LR,0.7435,1000,623,308,LR,,1000.0, 88 | 86,KBEST+5NN,0.6721,1000,623,308,5NN,,1000.0, 89 | 87,KBEST+2NN,0.6688,1000,623,308,2NN,,1000.0, 90 | 88,KBEST+1KNN,0.6883,1000,623,308,1KNN,,1000.0, 91 | 89,KBEST+LSVM,0.7532,1000,623,308,LSVM,,1000.0, 92 | 90,KBEST+NN100x50,0.7565,1000,623,308,NN100x50,,1000.0, 93 | 91,KBEST+NB,0.737,2000,623,308,NB,,2000.0, 94 | 92,KBEST+LR,0.7662,2000,623,308,LR,,2000.0, 95 | 93,KBEST+5NN,0.6721,2000,623,308,5NN,,2000.0, 96 | 94,KBEST+2NN,0.6688,2000,623,308,2NN,,2000.0, 97 | 95,KBEST+1KNN,0.6753,2000,623,308,1KNN,,2000.0, 98 | 96,KBEST+LSVM,0.763,2000,623,308,LSVM,,2000.0, 99 | 97,KBEST+NN100x50,0.7468,2000,623,308,NN100x50,,2000.0, 100 | 98,KBEST+NB,0.7403,3000,623,308,NB,,3000.0, 101 | 99,KBEST+LR,0.7727,3000,623,308,LR,,3000.0, 102 | 100,KBEST+5NN,0.6656,3000,623,308,5NN,,3000.0, 103 | 
101,KBEST+2NN,0.6331,3000,623,308,2NN,,3000.0, 104 | 102,KBEST+1KNN,0.6201,3000,623,308,1KNN,,3000.0, 105 | 103,KBEST+LSVM,0.7435,3000,623,308,LSVM,,3000.0, 106 | 104,KBEST+NN100x50,0.7338,3000,623,308,NN100x50,,3000.0, 107 | 105,KBEST+NB,0.737,4000,623,308,NB,,4000.0, 108 | 106,KBEST+LR,0.7468,4000,623,308,LR,,4000.0, 109 | 107,KBEST+5NN,0.6623,4000,623,308,5NN,,4000.0, 110 | 108,KBEST+2NN,0.6429,4000,623,308,2NN,,4000.0, 111 | 109,KBEST+1KNN,0.6526,4000,623,308,1KNN,,4000.0, 112 | 110,KBEST+LSVM,0.763,4000,623,308,LSVM,,4000.0, 113 | 111,KBEST+NN100x50,0.75,4000,623,308,NN100x50,,4000.0, 114 | 112,KBEST+NB,0.7435,5000,623,308,NB,,5000.0, 115 | 113,KBEST+LR,0.7597,5000,623,308,LR,,5000.0, 116 | 114,KBEST+5NN,0.6331,5000,623,308,5NN,,5000.0, 117 | 115,KBEST+2NN,0.6623,5000,623,308,2NN,,5000.0, 118 | 116,KBEST+1KNN,0.6623,5000,623,308,1KNN,,5000.0, 119 | 117,KBEST+LSVM,0.7532,5000,623,308,LSVM,,5000.0, 120 | 118,KBEST+NN100x50,0.724,5000,623,308,NN100x50,,5000.0, 121 | 119,KBEST+NB,0.737,6000,623,308,NB,,6000.0, 122 | 120,KBEST+LR,0.763,6000,623,308,LR,,6000.0, 123 | 121,KBEST+5NN,0.6494,6000,623,308,5NN,,6000.0, 124 | 122,KBEST+2NN,0.6494,6000,623,308,2NN,,6000.0, 125 | 123,KBEST+1KNN,0.6558,6000,623,308,1KNN,,6000.0, 126 | 124,KBEST+LSVM,0.763,6000,623,308,LSVM,,6000.0, 127 | 125,KBEST+NN100x50,0.724,6000,623,308,NN100x50,,6000.0, 128 | 126,KBEST+NB,0.7305,7000,623,308,NB,,7000.0, 129 | 127,KBEST+LR,0.7597,7000,623,308,LR,,7000.0, 130 | 128,KBEST+5NN,0.6818,7000,623,308,5NN,,7000.0, 131 | 129,KBEST+2NN,0.6883,7000,623,308,2NN,,7000.0, 132 | 130,KBEST+1KNN,0.6786,7000,623,308,1KNN,,7000.0, 133 | 131,KBEST+LSVM,0.7403,7000,623,308,LSVM,,7000.0, 134 | 132,KBEST+NN100x50,0.7305,7000,623,308,NN100x50,,7000.0, 135 | 133,TOPN+NB,0.7013,372,623,308,NB,,,5.0 136 | 134,TOPN+LR,0.6818,372,623,308,LR,,,5.0 137 | 135,TOPN+5NN,0.6688,372,623,308,5NN,,,5.0 138 | 136,TOPN+2NN,0.6883,372,623,308,2NN,,,5.0 139 | 137,TOPN+1KNN,0.6883,372,623,308,1KNN,,,5.0 140 | 
138,TOPN+LSVM,0.6591,372,623,308,LSVM,,,5.0 141 | 139,TOPN+NN100x50,0.7013,372,623,308,NN100x50,,,5.0 142 | 140,TOPN+NB,0.6786,1065,623,308,NB,,,10.0 143 | 141,TOPN+LR,0.7565,1065,623,308,LR,,,10.0 144 | 142,TOPN+5NN,0.6786,1065,623,308,5NN,,,10.0 145 | 143,TOPN+2NN,0.6981,1065,623,308,2NN,,,10.0 146 | 144,TOPN+1KNN,0.711,1065,623,308,1KNN,,,10.0 147 | 145,TOPN+LSVM,0.737,1065,623,308,LSVM,,,10.0 148 | 146,TOPN+NN100x50,0.737,1065,623,308,NN100x50,,,10.0 149 | 147,TOPN+NB,0.6948,1557,623,308,NB,,,15.0 150 | 148,TOPN+LR,0.7565,1557,623,308,LR,,,15.0 151 | 149,TOPN+5NN,0.6688,1557,623,308,5NN,,,15.0 152 | 150,TOPN+2NN,0.6818,1557,623,308,2NN,,,15.0 153 | 151,TOPN+1KNN,0.6851,1557,623,308,1KNN,,,15.0 154 | 152,TOPN+LSVM,0.737,1557,623,308,LSVM,,,15.0 155 | 153,TOPN+NN100x50,0.7305,1557,623,308,NN100x50,,,15.0 156 | 154,TOPN+NB,0.7273,1940,623,308,NB,,,20.0 157 | 155,TOPN+LR,0.7662,1940,623,308,LR,,,20.0 158 | 156,TOPN+5NN,0.6331,1940,623,308,5NN,,,20.0 159 | 157,TOPN+2NN,0.6688,1940,623,308,2NN,,,20.0 160 | 158,TOPN+1KNN,0.6786,1940,623,308,1KNN,,,20.0 161 | 159,TOPN+LSVM,0.7532,1940,623,308,LSVM,,,20.0 162 | 160,TOPN+NN100x50,0.7403,1940,623,308,NN100x50,,,20.0 163 | 161,TOPN+NB,0.7208,2372,623,308,NB,,,25.0 164 | 162,TOPN+LR,0.7565,2372,623,308,LR,,,25.0 165 | 163,TOPN+5NN,0.6299,2372,623,308,5NN,,,25.0 166 | 164,TOPN+2NN,0.6526,2372,623,308,2NN,,,25.0 167 | 165,TOPN+1KNN,0.6623,2372,623,308,1KNN,,,25.0 168 | 166,TOPN+LSVM,0.7565,2372,623,308,LSVM,,,25.0 169 | 167,TOPN+NN100x50,0.7305,2372,623,308,NN100x50,,,25.0 170 | 168,TOPN+NB,0.7208,4897,623,308,NB,,,50.0 171 | 169,TOPN+LR,0.776,4897,623,308,LR,,,50.0 172 | 170,TOPN+5NN,0.6461,4897,623,308,5NN,,,50.0 173 | 171,TOPN+2NN,0.6753,4897,623,308,2NN,,,50.0 174 | 172,TOPN+1KNN,0.6851,4897,623,308,1KNN,,,50.0 175 | 173,TOPN+LSVM,0.763,4897,623,308,LSVM,,,50.0 176 | 174,TOPN+NN100x50,0.7045,4897,623,308,NN100x50,,,50.0 177 | 175,TOPN+NB,0.7208,5777,623,308,NB,,,100.0 178 | 176,TOPN+LR,0.7662,5777,623,308,LR,,,100.0 179 | 
177,TOPN+5NN,0.6364,5777,623,308,5NN,,,100.0 180 | 178,TOPN+2NN,0.6753,5777,623,308,2NN,,,100.0 181 | 179,TOPN+1KNN,0.6883,5777,623,308,1KNN,,,100.0 182 | 180,TOPN+LSVM,0.7403,5777,623,308,LSVM,,,100.0 183 | 181,TOPN+NN100x50,0.7143,5777,623,308,NN100x50,,,100.0 184 | 182,TOPN+NB,0.7175,6692,623,308,NB,,,250.0 185 | 183,TOPN+LR,0.7695,6692,623,308,LR,,,250.0 186 | 184,TOPN+5NN,0.6396,6692,623,308,5NN,,,250.0 187 | 185,TOPN+2NN,0.6591,6692,623,308,2NN,,,250.0 188 | 186,TOPN+1KNN,0.6656,6692,623,308,1KNN,,,250.0 189 | 187,TOPN+LSVM,0.7435,6692,623,308,LSVM,,,250.0 190 | 188,TOPN+NN100x50,0.7078,6692,623,308,NN100x50,,,250.0 191 | 189,TOPN+NB,0.7208,7005,623,308,NB,,,500.0 192 | 190,TOPN+LR,0.7662,7005,623,308,LR,,,500.0 193 | 191,TOPN+5NN,0.6299,7005,623,308,5NN,,,500.0 194 | 192,TOPN+2NN,0.6591,7005,623,308,2NN,,,500.0 195 | 193,TOPN+1KNN,0.6656,7005,623,308,1KNN,,,500.0 196 | 194,TOPN+LSVM,0.7403,7005,623,308,LSVM,,,500.0 197 | 195,TOPN+NN100x50,0.7175,7005,623,308,NN100x50,,,500.0 198 | -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/lingspam/LINGSPAM_evaluation_results.csv: -------------------------------------------------------------------------------- 1 | ,Method,Accuracy,Number of features,Train size,Test size,Classifier,variance thershold,kbest,top_n 2 | 0,BOW+NB,0.9963,16695,546,270,NB,,, 3 | 1,BOW+LR,1.0,16695,546,270,LR,,, 4 | 2,BOW+5NN,0.8333,16695,546,270,5NN,,, 5 | 3,BOW+2NN,0.9074,16695,546,270,2NN,,, 6 | 4,BOW+1KNN,0.9074,16695,546,270,1KNN,,, 7 | 5,BOW+LSVM,1.0,16695,546,270,LSVM,,, 8 | 6,BOW+NN100x50,0.9963,16695,546,270,NN100x50,,, 9 | 7,META+NB,0.9963,2509,546,270,NB,,, 10 | 8,META+LR,1.0,2509,546,270,LR,,, 11 | 9,META+5NN,0.8704,2509,546,270,5NN,,, 12 | 10,META+2NN,0.9222,2509,546,270,2NN,,, 13 | 11,META+1KNN,0.9222,2509,546,270,1KNN,,, 14 | 12,META+LSVM,1.0,2509,546,270,LSVM,,, 15 | 13,META+NN100x50,1.0,2509,546,270,NN100x50,,, 16 | 14,LVAR+NB,0.9963,16695,546,270,NB,0.0005,, 17 | 
15,LVAR+LR,1.0,16695,546,270,LR,0.0005,, 18 | 16,LVAR+5NN,0.8333,16695,546,270,5NN,0.0005,, 19 | 17,LVAR+2NN,0.9074,16695,546,270,2NN,0.0005,, 20 | 18,LVAR+1KNN,0.9074,16695,546,270,1KNN,0.0005,, 21 | 19,LVAR+LSVM,1.0,16695,546,270,LSVM,0.0005,, 22 | 20,LVAR+NN100x50,0.9963,16695,546,270,NN100x50,0.0005,, 23 | 21,LVAR+NB,0.9963,16695,546,270,NB,0.001,, 24 | 22,LVAR+LR,1.0,16695,546,270,LR,0.001,, 25 | 23,LVAR+5NN,0.8333,16695,546,270,5NN,0.001,, 26 | 24,LVAR+2NN,0.9074,16695,546,270,2NN,0.001,, 27 | 25,LVAR+1KNN,0.9074,16695,546,270,1KNN,0.001,, 28 | 26,LVAR+LSVM,1.0,16695,546,270,LSVM,0.001,, 29 | 27,LVAR+NN100x50,0.9963,16695,546,270,NN100x50,0.001,, 30 | 28,LVAR+NB,0.9963,16695,546,270,NB,0.0015,, 31 | 29,LVAR+LR,1.0,16695,546,270,LR,0.0015,, 32 | 30,LVAR+5NN,0.8333,16695,546,270,5NN,0.0015,, 33 | 31,LVAR+2NN,0.9074,16695,546,270,2NN,0.0015,, 34 | 32,LVAR+1KNN,0.9074,16695,546,270,1KNN,0.0015,, 35 | 33,LVAR+LSVM,1.0,16695,546,270,LSVM,0.0015,, 36 | 34,LVAR+NN100x50,0.9963,16695,546,270,NN100x50,0.0015,, 37 | 35,LVAR+NB,0.9963,11058,546,270,NB,0.002,, 38 | 36,LVAR+LR,1.0,11058,546,270,LR,0.002,, 39 | 37,LVAR+5NN,0.8296,11058,546,270,5NN,0.002,, 40 | 38,LVAR+2NN,0.9185,11058,546,270,2NN,0.002,, 41 | 39,LVAR+1KNN,0.9185,11058,546,270,1KNN,0.002,, 42 | 40,LVAR+LSVM,0.9963,11058,546,270,LSVM,0.002,, 43 | 41,LVAR+NN100x50,1.0,11058,546,270,NN100x50,0.002,, 44 | 42,LVAR+NB,0.9963,11058,546,270,NB,0.003,, 45 | 43,LVAR+LR,1.0,11058,546,270,LR,0.003,, 46 | 44,LVAR+5NN,0.8296,11058,546,270,5NN,0.003,, 47 | 45,LVAR+2NN,0.9185,11058,546,270,2NN,0.003,, 48 | 46,LVAR+1KNN,0.9185,11058,546,270,1KNN,0.003,, 49 | 47,LVAR+LSVM,0.9963,11058,546,270,LSVM,0.003,, 50 | 48,LVAR+NN100x50,1.0,11058,546,270,NN100x50,0.003,, 51 | 49,LVAR+NB,0.9963,8234,546,270,NB,0.004,, 52 | 50,LVAR+LR,1.0,8234,546,270,LR,0.004,, 53 | 51,LVAR+5NN,0.837,8234,546,270,5NN,0.004,, 54 | 52,LVAR+2NN,0.9185,8234,546,270,2NN,0.004,, 55 | 53,LVAR+1KNN,0.9185,8234,546,270,1KNN,0.004,, 56 | 
54,LVAR+LSVM,0.9963,8234,546,270,LSVM,0.004,, 57 | 55,LVAR+NN100x50,0.9963,8234,546,270,NN100x50,0.004,, 58 | 56,LVAR+NB,0.9963,8234,546,270,NB,0.005,, 59 | 57,LVAR+LR,1.0,8234,546,270,LR,0.005,, 60 | 58,LVAR+5NN,0.837,8234,546,270,5NN,0.005,, 61 | 59,LVAR+2NN,0.9185,8234,546,270,2NN,0.005,, 62 | 60,LVAR+1KNN,0.9185,8234,546,270,1KNN,0.005,, 63 | 61,LVAR+LSVM,0.9963,8234,546,270,LSVM,0.005,, 64 | 62,LVAR+NN100x50,0.9963,8234,546,270,NN100x50,0.005,, 65 | 63,LVAR+NB,0.9815,5464,546,270,NB,0.01,, 66 | 64,LVAR+LR,0.9963,5464,546,270,LR,0.01,, 67 | 65,LVAR+5NN,0.8926,5464,546,270,5NN,0.01,, 68 | 66,LVAR+2NN,0.9222,5464,546,270,2NN,0.01,, 69 | 67,LVAR+1KNN,0.9222,5464,546,270,1KNN,0.01,, 70 | 68,LVAR+LSVM,0.9963,5464,546,270,LSVM,0.01,, 71 | 69,LVAR+NN100x50,0.9963,5464,546,270,NN100x50,0.01,, 72 | 70,KBEST+NB,0.9778,1000,546,270,NB,,1000.0, 73 | 71,KBEST+LR,1.0,1000,546,270,LR,,1000.0, 74 | 72,KBEST+5NN,0.9593,1000,546,270,5NN,,1000.0, 75 | 73,KBEST+2NN,0.9815,1000,546,270,2NN,,1000.0, 76 | 74,KBEST+1KNN,0.9815,1000,546,270,1KNN,,1000.0, 77 | 75,KBEST+LSVM,1.0,1000,546,270,LSVM,,1000.0, 78 | 76,KBEST+NN100x50,1.0,1000,546,270,NN100x50,,1000.0, 79 | 77,KBEST+NB,0.9778,2000,546,270,NB,,2000.0, 80 | 78,KBEST+LR,0.9963,2000,546,270,LR,,2000.0, 81 | 79,KBEST+5NN,0.9741,2000,546,270,5NN,,2000.0, 82 | 80,KBEST+2NN,0.9741,2000,546,270,2NN,,2000.0, 83 | 81,KBEST+1KNN,0.9741,2000,546,270,1KNN,,2000.0, 84 | 82,KBEST+LSVM,0.9963,2000,546,270,LSVM,,2000.0, 85 | 83,KBEST+NN100x50,0.9963,2000,546,270,NN100x50,,2000.0, 86 | 84,KBEST+NB,0.9963,3000,546,270,NB,,3000.0, 87 | 85,KBEST+LR,0.9963,3000,546,270,LR,,3000.0, 88 | 86,KBEST+5NN,0.9667,3000,546,270,5NN,,3000.0, 89 | 87,KBEST+2NN,0.9815,3000,546,270,2NN,,3000.0, 90 | 88,KBEST+1KNN,0.9889,3000,546,270,1KNN,,3000.0, 91 | 89,KBEST+LSVM,0.9963,3000,546,270,LSVM,,3000.0, 92 | 90,KBEST+NN100x50,0.9926,3000,546,270,NN100x50,,3000.0, 93 | 91,KBEST+NB,0.9963,4000,546,270,NB,,4000.0, 94 | 92,KBEST+LR,0.9963,4000,546,270,LR,,4000.0, 95 | 
93,KBEST+5NN,0.963,4000,546,270,5NN,,4000.0, 96 | 94,KBEST+2NN,0.9852,4000,546,270,2NN,,4000.0, 97 | 95,KBEST+1KNN,0.9852,4000,546,270,1KNN,,4000.0, 98 | 96,KBEST+LSVM,0.9963,4000,546,270,LSVM,,4000.0, 99 | 97,KBEST+NN100x50,0.9926,4000,546,270,NN100x50,,4000.0, 100 | 98,KBEST+NB,0.9963,5000,546,270,NB,,5000.0, 101 | 99,KBEST+LR,0.9963,5000,546,270,LR,,5000.0, 102 | 100,KBEST+5NN,0.9778,5000,546,270,5NN,,5000.0, 103 | 101,KBEST+2NN,0.9741,5000,546,270,2NN,,5000.0, 104 | 102,KBEST+1KNN,0.9741,5000,546,270,1KNN,,5000.0, 105 | 103,KBEST+LSVM,0.9963,5000,546,270,LSVM,,5000.0, 106 | 104,KBEST+NN100x50,0.9926,5000,546,270,NN100x50,,5000.0, 107 | 105,KBEST+NB,0.9963,6000,546,270,NB,,6000.0, 108 | 106,KBEST+LR,0.9963,6000,546,270,LR,,6000.0, 109 | 107,KBEST+5NN,0.9667,6000,546,270,5NN,,6000.0, 110 | 108,KBEST+2NN,0.9667,6000,546,270,2NN,,6000.0, 111 | 109,KBEST+1KNN,0.9667,6000,546,270,1KNN,,6000.0, 112 | 110,KBEST+LSVM,0.9963,6000,546,270,LSVM,,6000.0, 113 | 111,KBEST+NN100x50,0.9926,6000,546,270,NN100x50,,6000.0, 114 | 112,KBEST+NB,0.9963,7000,546,270,NB,,7000.0, 115 | 113,KBEST+LR,0.9963,7000,546,270,LR,,7000.0, 116 | 114,KBEST+5NN,0.9481,7000,546,270,5NN,,7000.0, 117 | 115,KBEST+2NN,0.9556,7000,546,270,2NN,,7000.0, 118 | 116,KBEST+1KNN,0.9556,7000,546,270,1KNN,,7000.0, 119 | 117,KBEST+LSVM,0.9963,7000,546,270,LSVM,,7000.0, 120 | 118,KBEST+NN100x50,0.9926,7000,546,270,NN100x50,,7000.0, 121 | 119,KBEST+NB,0.9963,10000,546,270,NB,,10000.0, 122 | 120,KBEST+LR,1.0,10000,546,270,LR,,10000.0, 123 | 121,KBEST+5NN,0.8889,10000,546,270,5NN,,10000.0, 124 | 122,KBEST+2NN,0.9222,10000,546,270,2NN,,10000.0, 125 | 123,KBEST+1KNN,0.9222,10000,546,270,1KNN,,10000.0, 126 | 124,KBEST+LSVM,0.9963,10000,546,270,LSVM,,10000.0, 127 | 125,KBEST+NN100x50,0.9963,10000,546,270,NN100x50,,10000.0, 128 | 126,KBEST+NB,0.9963,14000,546,270,NB,,14000.0, 129 | 127,KBEST+LR,1.0,14000,546,270,LR,,14000.0, 130 | 128,KBEST+5NN,0.837,14000,546,270,5NN,,14000.0, 131 | 
129,KBEST+2NN,0.9222,14000,546,270,2NN,,14000.0, 132 | 130,KBEST+1KNN,0.9222,14000,546,270,1KNN,,14000.0, 133 | 131,KBEST+LSVM,0.9963,14000,546,270,LSVM,,14000.0, 134 | 132,KBEST+NN100x50,0.9963,14000,546,270,NN100x50,,14000.0, 135 | 133,TOPN+NB,0.9815,120,546,270,NB,,,5.0 136 | 134,TOPN+LR,1.0,120,546,270,LR,,,5.0 137 | 135,TOPN+5NN,0.9926,120,546,270,5NN,,,5.0 138 | 136,TOPN+2NN,0.9852,120,546,270,2NN,,,5.0 139 | 137,TOPN+1KNN,0.9852,120,546,270,1KNN,,,5.0 140 | 138,TOPN+LSVM,0.9963,120,546,270,LSVM,,,5.0 141 | 139,TOPN+NN100x50,0.9963,120,546,270,NN100x50,,,5.0 142 | 140,TOPN+NB,0.9778,296,546,270,NB,,,10.0 143 | 141,TOPN+LR,1.0,296,546,270,LR,,,10.0 144 | 142,TOPN+5NN,0.9704,296,546,270,5NN,,,10.0 145 | 143,TOPN+2NN,0.9556,296,546,270,2NN,,,10.0 146 | 144,TOPN+1KNN,0.9593,296,546,270,1KNN,,,10.0 147 | 145,TOPN+LSVM,0.9963,296,546,270,LSVM,,,10.0 148 | 146,TOPN+NN100x50,0.9926,296,546,270,NN100x50,,,10.0 149 | 147,TOPN+NB,0.9815,526,546,270,NB,,,15.0 150 | 148,TOPN+LR,1.0,526,546,270,LR,,,15.0 151 | 149,TOPN+5NN,0.9481,526,546,270,5NN,,,15.0 152 | 150,TOPN+2NN,0.9667,526,546,270,2NN,,,15.0 153 | 151,TOPN+1KNN,0.9667,526,546,270,1KNN,,,15.0 154 | 152,TOPN+LSVM,0.9963,526,546,270,LSVM,,,15.0 155 | 153,TOPN+NN100x50,0.9926,526,546,270,NN100x50,,,15.0 156 | 154,TOPN+NB,0.9778,758,546,270,NB,,,20.0 157 | 155,TOPN+LR,1.0,758,546,270,LR,,,20.0 158 | 156,TOPN+5NN,0.9407,758,546,270,5NN,,,20.0 159 | 157,TOPN+2NN,0.9519,758,546,270,2NN,,,20.0 160 | 158,TOPN+1KNN,0.9519,758,546,270,1KNN,,,20.0 161 | 159,TOPN+LSVM,0.9963,758,546,270,LSVM,,,20.0 162 | 160,TOPN+NN100x50,1.0,758,546,270,NN100x50,,,20.0 163 | 161,TOPN+NB,0.9778,995,546,270,NB,,,25.0 164 | 162,TOPN+LR,1.0,995,546,270,LR,,,25.0 165 | 163,TOPN+5NN,0.9259,995,546,270,5NN,,,25.0 166 | 164,TOPN+2NN,0.9519,995,546,270,2NN,,,25.0 167 | 165,TOPN+1KNN,0.9519,995,546,270,1KNN,,,25.0 168 | 166,TOPN+LSVM,0.9963,995,546,270,LSVM,,,25.0 169 | 167,TOPN+NN100x50,1.0,995,546,270,NN100x50,,,25.0 170 | 
168,TOPN+NB,0.9963,2274,546,270,NB,,,50.0 171 | 169,TOPN+LR,1.0,2274,546,270,LR,,,50.0 172 | 170,TOPN+5NN,0.8926,2274,546,270,5NN,,,50.0 173 | 171,TOPN+2NN,0.9333,2274,546,270,2NN,,,50.0 174 | 172,TOPN+1KNN,0.9333,2274,546,270,1KNN,,,50.0 175 | 173,TOPN+LSVM,0.9963,2274,546,270,LSVM,,,50.0 176 | 174,TOPN+NN100x50,1.0,2274,546,270,NN100x50,,,50.0 177 | 175,TOPN+NB,0.9963,4443,546,270,NB,,,100.0 178 | 176,TOPN+LR,1.0,4443,546,270,LR,,,100.0 179 | 177,TOPN+5NN,0.8778,4443,546,270,5NN,,,100.0 180 | 178,TOPN+2NN,0.9333,4443,546,270,2NN,,,100.0 181 | 179,TOPN+1KNN,0.9333,4443,546,270,1KNN,,,100.0 182 | 180,TOPN+LSVM,0.9963,4443,546,270,LSVM,,,100.0 183 | 181,TOPN+NN100x50,0.9963,4443,546,270,NN100x50,,,100.0 184 | 182,TOPN+NB,0.9963,8975,546,270,NB,,,250.0 185 | 183,TOPN+LR,1.0,8975,546,270,LR,,,250.0 186 | 184,TOPN+5NN,0.8481,8975,546,270,5NN,,,250.0 187 | 185,TOPN+2NN,0.9259,8975,546,270,2NN,,,250.0 188 | 186,TOPN+1KNN,0.9259,8975,546,270,1KNN,,,250.0 189 | 187,TOPN+LSVM,0.9963,8975,546,270,LSVM,,,250.0 190 | 188,TOPN+NN100x50,1.0,8975,546,270,NN100x50,,,250.0 191 | 189,TOPN+NB,0.9963,12104,546,270,NB,,,500.0 192 | 190,TOPN+LR,1.0,12104,546,270,LR,,,500.0 193 | 191,TOPN+5NN,0.8407,12104,546,270,5NN,,,500.0 194 | 192,TOPN+2NN,0.9259,12104,546,270,2NN,,,500.0 195 | 193,TOPN+1KNN,0.9259,12104,546,270,1KNN,,,500.0 196 | 194,TOPN+LSVM,0.9963,12104,546,270,LSVM,,,500.0 197 | 195,TOPN+NN100x50,0.9963,12104,546,270,NN100x50,,,500.0 198 | -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/reuters/REUTERS_evaluation_results.csv: -------------------------------------------------------------------------------- 1 | ,Method,Accuracy,Number of features,Train size,Test size,Classifier,variance thershold,kbest,top_n 2 | 0,BOW+NB,0.8191,15514,4501,2217,NB,,, 3 | 1,BOW+LR,0.8746,15514,4501,2217,LR,,, 4 | 2,BOW+5NN,0.7582,15514,4501,2217,5NN,,, 5 | 3,BOW+2NN,0.802,15514,4501,2217,2NN,,, 6 | 
4,BOW+1KNN,0.7997,15514,4501,2217,1KNN,,, 7 | 5,BOW+LSVM,0.8742,15514,4501,2217,LSVM,,, 8 | 6,BOW+NN100x50,0.8656,15514,4501,2217,NN100x50,,, 9 | 7,META+NB,0.8376,2494,4501,2217,NB,,, 10 | 8,META+LR,0.876,2494,4501,2217,LR,,, 11 | 9,META+5NN,0.8002,2494,4501,2217,5NN,,, 12 | 10,META+2NN,0.8223,2494,4501,2217,2NN,,, 13 | 11,META+1KNN,0.8245,2494,4501,2217,1KNN,,, 14 | 12,META+LSVM,0.8746,2494,4501,2217,LSVM,,, 15 | 13,META+NN100x50,0.8701,2494,4501,2217,NN100x50,,, 16 | 14,LVAR+NB,0.8372,7624,4501,2217,NB,0.0005,, 17 | 15,LVAR+LR,0.8755,7624,4501,2217,LR,0.0005,, 18 | 16,LVAR+5NN,0.7677,7624,4501,2217,5NN,0.0005,, 19 | 17,LVAR+2NN,0.8097,7624,4501,2217,2NN,0.0005,, 20 | 18,LVAR+1KNN,0.8088,7624,4501,2217,1KNN,0.0005,, 21 | 19,LVAR+LSVM,0.8746,7624,4501,2217,LSVM,0.0005,, 22 | 20,LVAR+NN100x50,0.8705,7624,4501,2217,NN100x50,0.0005,, 23 | 21,LVAR+NB,0.8363,5780,4501,2217,NB,0.001,, 24 | 22,LVAR+LR,0.8733,5780,4501,2217,LR,0.001,, 25 | 23,LVAR+5NN,0.7826,5780,4501,2217,5NN,0.001,, 26 | 24,LVAR+2NN,0.816,5780,4501,2217,2NN,0.001,, 27 | 25,LVAR+1KNN,0.8137,5780,4501,2217,1KNN,0.001,, 28 | 26,LVAR+LSVM,0.8733,5780,4501,2217,LSVM,0.001,, 29 | 27,LVAR+NN100x50,0.8714,5780,4501,2217,NN100x50,0.001,, 30 | 28,LVAR+NB,0.8367,4870,4501,2217,NB,0.0015,, 31 | 29,LVAR+LR,0.8742,4870,4501,2217,LR,0.0015,, 32 | 30,LVAR+5NN,0.7966,4870,4501,2217,5NN,0.0015,, 33 | 31,LVAR+2NN,0.8205,4870,4501,2217,2NN,0.0015,, 34 | 32,LVAR+1KNN,0.8173,4870,4501,2217,1KNN,0.0015,, 35 | 33,LVAR+LSVM,0.8737,4870,4501,2217,LSVM,0.0015,, 36 | 34,LVAR+NN100x50,0.8724,4870,4501,2217,NN100x50,0.0015,, 37 | 35,LVAR+NB,0.8399,4014,4501,2217,NB,0.002,, 38 | 36,LVAR+LR,0.8737,4014,4501,2217,LR,0.002,, 39 | 37,LVAR+5NN,0.8083,4014,4501,2217,5NN,0.002,, 40 | 38,LVAR+2NN,0.8236,4014,4501,2217,2NN,0.002,, 41 | 39,LVAR+1KNN,0.8232,4014,4501,2217,1KNN,0.002,, 42 | 40,LVAR+LSVM,0.8705,4014,4501,2217,LSVM,0.002,, 43 | 41,LVAR+NN100x50,0.8665,4014,4501,2217,NN100x50,0.002,, 44 | 42,LVAR+NB,0.8403,3356,4501,2217,NB,0.003,, 
45 | 43,LVAR+LR,0.8755,3356,4501,2217,LR,0.003,, 46 | 44,LVAR+5NN,0.8101,3356,4501,2217,5NN,0.003,, 47 | 45,LVAR+2NN,0.8236,3356,4501,2217,2NN,0.003,, 48 | 46,LVAR+1KNN,0.8272,3356,4501,2217,1KNN,0.003,, 49 | 47,LVAR+LSVM,0.8714,3356,4501,2217,LSVM,0.003,, 50 | 48,LVAR+NN100x50,0.8687,3356,4501,2217,NN100x50,0.003,, 51 | 49,LVAR+NB,0.8381,2772,4501,2217,NB,0.004,, 52 | 50,LVAR+LR,0.8755,2772,4501,2217,LR,0.004,, 53 | 51,LVAR+5NN,0.8106,2772,4501,2217,5NN,0.004,, 54 | 52,LVAR+2NN,0.8245,2772,4501,2217,2NN,0.004,, 55 | 53,LVAR+1KNN,0.8259,2772,4501,2217,1KNN,0.004,, 56 | 54,LVAR+LSVM,0.8692,2772,4501,2217,LSVM,0.004,, 57 | 55,LVAR+NN100x50,0.8724,2772,4501,2217,NN100x50,0.004,, 58 | 56,LVAR+NB,0.839,2458,4501,2217,NB,0.005,, 59 | 57,LVAR+LR,0.8751,2458,4501,2217,LR,0.005,, 60 | 58,LVAR+5NN,0.811,2458,4501,2217,5NN,0.005,, 61 | 59,LVAR+2NN,0.8254,2458,4501,2217,2NN,0.005,, 62 | 60,LVAR+1KNN,0.8268,2458,4501,2217,1KNN,0.005,, 63 | 61,LVAR+LSVM,0.8692,2458,4501,2217,LSVM,0.005,, 64 | 62,LVAR+NN100x50,0.8724,2458,4501,2217,NN100x50,0.005,, 65 | 63,LVAR+NB,0.8349,1482,4501,2217,NB,0.01,, 66 | 64,LVAR+LR,0.8742,1482,4501,2217,LR,0.01,, 67 | 65,LVAR+5NN,0.8209,1482,4501,2217,5NN,0.01,, 68 | 66,LVAR+2NN,0.8295,1482,4501,2217,2NN,0.01,, 69 | 67,LVAR+1KNN,0.8295,1482,4501,2217,1KNN,0.01,, 70 | 68,LVAR+LSVM,0.8669,1482,4501,2217,LSVM,0.01,, 71 | 69,LVAR+NN100x50,0.8674,1482,4501,2217,NN100x50,0.01,, 72 | 70,KBEST+NB,0.8038,1000,4501,2217,NB,,1000.0, 73 | 71,KBEST+LR,0.8358,1000,4501,2217,LR,,1000.0, 74 | 72,KBEST+5NN,0.7939,1000,4501,2217,5NN,,1000.0, 75 | 73,KBEST+2NN,0.8033,1000,4501,2217,2NN,,1000.0, 76 | 74,KBEST+1KNN,0.8029,1000,4501,2217,1KNN,,1000.0, 77 | 75,KBEST+LSVM,0.8295,1000,4501,2217,LSVM,,1000.0, 78 | 76,KBEST+NN100x50,0.83,1000,4501,2217,NN100x50,,1000.0, 79 | 77,KBEST+NB,0.8169,2000,4501,2217,NB,,2000.0, 80 | 78,KBEST+LR,0.8484,2000,4501,2217,LR,,2000.0, 81 | 79,KBEST+5NN,0.7866,2000,4501,2217,5NN,,2000.0, 82 | 80,KBEST+2NN,0.8097,2000,4501,2217,2NN,,2000.0, 83 
| 81,KBEST+1KNN,0.8088,2000,4501,2217,1KNN,,2000.0, 84 | 82,KBEST+LSVM,0.8309,2000,4501,2217,LSVM,,2000.0, 85 | 83,KBEST+NN100x50,0.8331,2000,4501,2217,NN100x50,,2000.0, 86 | 84,KBEST+NB,0.8187,3000,4501,2217,NB,,3000.0, 87 | 85,KBEST+LR,0.8633,3000,4501,2217,LR,,3000.0, 88 | 86,KBEST+5NN,0.793,3000,4501,2217,5NN,,3000.0, 89 | 87,KBEST+2NN,0.8119,3000,4501,2217,2NN,,3000.0, 90 | 88,KBEST+1KNN,0.8124,3000,4501,2217,1KNN,,3000.0, 91 | 89,KBEST+LSVM,0.8507,3000,4501,2217,LSVM,,3000.0, 92 | 90,KBEST+NN100x50,0.8579,3000,4501,2217,NN100x50,,3000.0, 93 | 91,KBEST+NB,0.8272,4000,4501,2217,NB,,4000.0, 94 | 92,KBEST+LR,0.8687,4000,4501,2217,LR,,4000.0, 95 | 93,KBEST+5NN,0.788,4000,4501,2217,5NN,,4000.0, 96 | 94,KBEST+2NN,0.8182,4000,4501,2217,2NN,,4000.0, 97 | 95,KBEST+1KNN,0.8187,4000,4501,2217,1KNN,,4000.0, 98 | 96,KBEST+LSVM,0.8602,4000,4501,2217,LSVM,,4000.0, 99 | 97,KBEST+NN100x50,0.8552,4000,4501,2217,NN100x50,,4000.0, 100 | 98,KBEST+NB,0.8259,5000,4501,2217,NB,,5000.0, 101 | 99,KBEST+LR,0.8687,5000,4501,2217,LR,,5000.0, 102 | 100,KBEST+5NN,0.7889,5000,4501,2217,5NN,,5000.0, 103 | 101,KBEST+2NN,0.8128,5000,4501,2217,2NN,,5000.0, 104 | 102,KBEST+1KNN,0.8142,5000,4501,2217,1KNN,,5000.0, 105 | 103,KBEST+LSVM,0.8638,5000,4501,2217,LSVM,,5000.0, 106 | 104,KBEST+NN100x50,0.8633,5000,4501,2217,NN100x50,,5000.0, 107 | 105,KBEST+NB,0.8277,6000,4501,2217,NB,,6000.0, 108 | 106,KBEST+LR,0.8737,6000,4501,2217,LR,,6000.0, 109 | 107,KBEST+5NN,0.7957,6000,4501,2217,5NN,,6000.0, 110 | 108,KBEST+2NN,0.8191,6000,4501,2217,2NN,,6000.0, 111 | 109,KBEST+1KNN,0.82,6000,4501,2217,1KNN,,6000.0, 112 | 110,KBEST+LSVM,0.8714,6000,4501,2217,LSVM,,6000.0, 113 | 111,KBEST+NN100x50,0.8665,6000,4501,2217,NN100x50,,6000.0, 114 | 112,KBEST+NB,0.8354,7000,4501,2217,NB,,7000.0, 115 | 113,KBEST+LR,0.8751,7000,4501,2217,LR,,7000.0, 116 | 114,KBEST+5NN,0.7952,7000,4501,2217,5NN,,7000.0, 117 | 115,KBEST+2NN,0.8205,7000,4501,2217,2NN,,7000.0, 118 | 116,KBEST+1KNN,0.8205,7000,4501,2217,1KNN,,7000.0, 119 | 
117,KBEST+LSVM,0.8737,7000,4501,2217,LSVM,,7000.0, 120 | 118,KBEST+NN100x50,0.8696,7000,4501,2217,NN100x50,,7000.0, 121 | 119,KBEST+NB,0.8336,10000,4501,2217,NB,,10000.0, 122 | 120,KBEST+LR,0.8764,10000,4501,2217,LR,,10000.0, 123 | 121,KBEST+5NN,0.7939,10000,4501,2217,5NN,,10000.0, 124 | 122,KBEST+2NN,0.816,10000,4501,2217,2NN,,10000.0, 125 | 123,KBEST+1KNN,0.8191,10000,4501,2217,1KNN,,10000.0, 126 | 124,KBEST+LSVM,0.8719,10000,4501,2217,LSVM,,10000.0, 127 | 125,KBEST+NN100x50,0.8624,10000,4501,2217,NN100x50,,10000.0, 128 | 126,KBEST+NB,0.8236,14000,4501,2217,NB,,14000.0, 129 | 127,KBEST+LR,0.8728,14000,4501,2217,LR,,14000.0, 130 | 128,KBEST+5NN,0.7497,14000,4501,2217,5NN,,14000.0, 131 | 129,KBEST+2NN,0.7961,14000,4501,2217,2NN,,14000.0, 132 | 130,KBEST+1KNN,0.7948,14000,4501,2217,1KNN,,14000.0, 133 | 131,KBEST+LSVM,0.8737,14000,4501,2217,LSVM,,14000.0, 134 | 132,KBEST+NN100x50,0.8705,14000,4501,2217,NN100x50,,14000.0, 135 | 133,TOPN+NB,0.8033,293,4501,2217,NB,,,5.0 136 | 134,TOPN+LR,0.8399,293,4501,2217,LR,,,5.0 137 | 135,TOPN+5NN,0.8164,293,4501,2217,5NN,,,5.0 138 | 136,TOPN+2NN,0.8083,293,4501,2217,2NN,,,5.0 139 | 137,TOPN+1KNN,0.8078,293,4501,2217,1KNN,,,5.0 140 | 138,TOPN+LSVM,0.8214,293,4501,2217,LSVM,,,5.0 141 | 139,TOPN+NN100x50,0.8336,293,4501,2217,NN100x50,,,5.0 142 | 140,TOPN+NB,0.8295,809,4501,2217,NB,,,10.0 143 | 141,TOPN+LR,0.8642,809,4501,2217,LR,,,10.0 144 | 142,TOPN+5NN,0.8272,809,4501,2217,5NN,,,10.0 145 | 143,TOPN+2NN,0.8259,809,4501,2217,2NN,,,10.0 146 | 144,TOPN+1KNN,0.8241,809,4501,2217,1KNN,,,10.0 147 | 145,TOPN+LSVM,0.8466,809,4501,2217,LSVM,,,10.0 148 | 146,TOPN+NN100x50,0.8548,809,4501,2217,NN100x50,,,10.0 149 | 147,TOPN+NB,0.8331,1615,4501,2217,NB,,,15.0 150 | 148,TOPN+LR,0.8724,1615,4501,2217,LR,,,15.0 151 | 149,TOPN+5NN,0.8232,1615,4501,2217,5NN,,,15.0 152 | 150,TOPN+2NN,0.8313,1615,4501,2217,2NN,,,15.0 153 | 151,TOPN+1KNN,0.8309,1615,4501,2217,1KNN,,,15.0 154 | 152,TOPN+LSVM,0.8606,1615,4501,2217,LSVM,,,15.0 155 | 
153,TOPN+NN100x50,0.8674,1615,4501,2217,NN100x50,,,15.0 156 | 154,TOPN+NB,0.8372,2430,4501,2217,NB,,,20.0 157 | 155,TOPN+LR,0.8728,2430,4501,2217,LR,,,20.0 158 | 156,TOPN+5NN,0.8155,2430,4501,2217,5NN,,,20.0 159 | 157,TOPN+2NN,0.8295,2430,4501,2217,2NN,,,20.0 160 | 158,TOPN+1KNN,0.8309,2430,4501,2217,1KNN,,,20.0 161 | 159,TOPN+LSVM,0.8665,2430,4501,2217,LSVM,,,20.0 162 | 160,TOPN+NN100x50,0.8701,2430,4501,2217,NN100x50,,,20.0 163 | 161,TOPN+NB,0.839,3182,4501,2217,NB,,,25.0 164 | 162,TOPN+LR,0.8742,3182,4501,2217,LR,,,25.0 165 | 163,TOPN+5NN,0.811,3182,4501,2217,5NN,,,25.0 166 | 164,TOPN+2NN,0.8272,3182,4501,2217,2NN,,,25.0 167 | 165,TOPN+1KNN,0.8268,3182,4501,2217,1KNN,,,25.0 168 | 166,TOPN+LSVM,0.8678,3182,4501,2217,LSVM,,,25.0 169 | 167,TOPN+NN100x50,0.8687,3182,4501,2217,NN100x50,,,25.0 170 | 168,TOPN+NB,0.8372,5359,4501,2217,NB,,,50.0 171 | 169,TOPN+LR,0.8755,5359,4501,2217,LR,,,50.0 172 | 170,TOPN+5NN,0.802,5359,4501,2217,5NN,,,50.0 173 | 171,TOPN+2NN,0.8263,5359,4501,2217,2NN,,,50.0 174 | 172,TOPN+1KNN,0.8268,5359,4501,2217,1KNN,,,50.0 175 | 173,TOPN+LSVM,0.8724,5359,4501,2217,LSVM,,,50.0 176 | 174,TOPN+NN100x50,0.8714,5359,4501,2217,NN100x50,,,50.0 177 | 175,TOPN+NB,0.8399,7171,4501,2217,NB,,,100.0 178 | 176,TOPN+LR,0.8782,7171,4501,2217,LR,,,100.0 179 | 177,TOPN+5NN,0.7993,7171,4501,2217,5NN,,,100.0 180 | 178,TOPN+2NN,0.8232,7171,4501,2217,2NN,,,100.0 181 | 179,TOPN+1KNN,0.8227,7171,4501,2217,1KNN,,,100.0 182 | 180,TOPN+LSVM,0.8737,7171,4501,2217,LSVM,,,100.0 183 | 181,TOPN+NN100x50,0.8674,7171,4501,2217,NN100x50,,,100.0 184 | 182,TOPN+NB,0.8381,8187,4501,2217,NB,,,250.0 185 | 183,TOPN+LR,0.876,8187,4501,2217,LR,,,250.0 186 | 184,TOPN+5NN,0.7979,8187,4501,2217,5NN,,,250.0 187 | 185,TOPN+2NN,0.8232,8187,4501,2217,2NN,,,250.0 188 | 186,TOPN+1KNN,0.8232,8187,4501,2217,1KNN,,,250.0 189 | 187,TOPN+LSVM,0.8724,8187,4501,2217,LSVM,,,250.0 190 | 188,TOPN+NN100x50,0.8733,8187,4501,2217,NN100x50,,,250.0 191 | 189,TOPN+NB,0.8376,8246,4501,2217,NB,,,500.0 192 | 
190,TOPN+LR,0.8778,8246,4501,2217,LR,,,500.0 193 | 191,TOPN+5NN,0.7975,8246,4501,2217,5NN,,,500.0 194 | 192,TOPN+2NN,0.8232,8246,4501,2217,2NN,,,500.0 195 | 193,TOPN+1KNN,0.8232,8246,4501,2217,1KNN,,,500.0 196 | 194,TOPN+LSVM,0.8733,8246,4501,2217,LSVM,,,500.0 197 | 195,TOPN+NN100x50,0.8683,8246,4501,2217,NN100x50,,,500.0 198 | -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/jira_issues/JIRAISSUES_evaluation_results.csv: -------------------------------------------------------------------------------- 1 | ,Method,Accuracy,Number of features,Train size,Test size,Classifier,variance thershold,kbest,top_n 2 | 0,BOW+NB,0.6989,14539,3229,1591,NB,,, 3 | 1,BOW+LR,0.7461,14539,3229,1591,LR,,, 4 | 2,BOW+5NN,0.6486,14539,3229,1591,5NN,,, 5 | 3,BOW+2NN,0.6644,14539,3229,1591,2NN,,, 6 | 4,BOW+1KNN,0.6637,14539,3229,1591,1KNN,,, 7 | 5,BOW+LSVM,0.7304,14539,3229,1591,LSVM,,, 8 | 6,BOW+NN100x50,0.741,14539,3229,1591,NN100x50,,, 9 | 7,META+NB,0.6989,2942,3229,1591,NB,,, 10 | 8,META+LR,0.7461,2942,3229,1591,LR,,, 11 | 9,META+5NN,0.6329,2942,3229,1591,5NN,,, 12 | 10,META+2NN,0.6662,2942,3229,1591,2NN,,, 13 | 11,META+1KNN,0.6688,2942,3229,1591,1KNN,,, 14 | 12,META+LSVM,0.731,2942,3229,1591,LSVM,,, 15 | 13,META+NN100x50,0.7467,2942,3229,1591,NN100x50,,, 16 | 14,LVAR+NB,0.6933,9706,3229,1591,NB,0.0005,, 17 | 15,LVAR+LR,0.7442,9706,3229,1591,LR,0.0005,, 18 | 16,LVAR+5NN,0.6411,9706,3229,1591,5NN,0.0005,, 19 | 17,LVAR+2NN,0.6593,9706,3229,1591,2NN,0.0005,, 20 | 18,LVAR+1KNN,0.6512,9706,3229,1591,1KNN,0.0005,, 21 | 19,LVAR+LSVM,0.7291,9706,3229,1591,LSVM,0.0005,, 22 | 20,LVAR+NN100x50,0.7379,9706,3229,1591,NN100x50,0.0005,, 23 | 21,LVAR+NB,0.6895,8221,3229,1591,NB,0.001,, 24 | 22,LVAR+LR,0.7404,8221,3229,1591,LR,0.001,, 25 | 23,LVAR+5NN,0.6386,8221,3229,1591,5NN,0.001,, 26 | 24,LVAR+2NN,0.6644,8221,3229,1591,2NN,0.001,, 27 | 25,LVAR+1KNN,0.6562,8221,3229,1591,1KNN,0.001,, 28 | 26,LVAR+LSVM,0.7197,8221,3229,1591,LSVM,0.001,, 
29 | 27,LVAR+NN100x50,0.7348,8221,3229,1591,NN100x50,0.001,, 30 | 28,LVAR+NB,0.6882,6489,3229,1591,NB,0.0015,, 31 | 29,LVAR+LR,0.736,6489,3229,1591,LR,0.0015,, 32 | 30,LVAR+5NN,0.6279,6489,3229,1591,5NN,0.0015,, 33 | 31,LVAR+2NN,0.6574,6489,3229,1591,2NN,0.0015,, 34 | 32,LVAR+1KNN,0.6543,6489,3229,1591,1KNN,0.0015,, 35 | 33,LVAR+LSVM,0.7178,6489,3229,1591,LSVM,0.0015,, 36 | 34,LVAR+NN100x50,0.7398,6489,3229,1591,NN100x50,0.0015,, 37 | 35,LVAR+NB,0.6889,5833,3229,1591,NB,0.002,, 38 | 36,LVAR+LR,0.7373,5833,3229,1591,LR,0.002,, 39 | 37,LVAR+5NN,0.6273,5833,3229,1591,5NN,0.002,, 40 | 38,LVAR+2NN,0.6618,5833,3229,1591,2NN,0.002,, 41 | 39,LVAR+1KNN,0.6574,5833,3229,1591,1KNN,0.002,, 42 | 40,LVAR+LSVM,0.719,5833,3229,1591,LSVM,0.002,, 43 | 41,LVAR+NN100x50,0.7304,5833,3229,1591,NN100x50,0.002,, 44 | 42,LVAR+NB,0.6857,4821,3229,1591,NB,0.003,, 45 | 43,LVAR+LR,0.7316,4821,3229,1591,LR,0.003,, 46 | 44,LVAR+5NN,0.6455,4821,3229,1591,5NN,0.003,, 47 | 45,LVAR+2NN,0.6637,4821,3229,1591,2NN,0.003,, 48 | 46,LVAR+1KNN,0.6669,4821,3229,1591,1KNN,0.003,, 49 | 47,LVAR+LSVM,0.7128,4821,3229,1591,LSVM,0.003,, 50 | 48,LVAR+NN100x50,0.7385,4821,3229,1591,NN100x50,0.003,, 51 | 49,LVAR+NB,0.6801,4450,3229,1591,NB,0.004,, 52 | 50,LVAR+LR,0.7322,4450,3229,1591,LR,0.004,, 53 | 51,LVAR+5NN,0.6354,4450,3229,1591,5NN,0.004,, 54 | 52,LVAR+2NN,0.6644,4450,3229,1591,2NN,0.004,, 55 | 53,LVAR+1KNN,0.6656,4450,3229,1591,1KNN,0.004,, 56 | 54,LVAR+LSVM,0.7165,4450,3229,1591,LSVM,0.004,, 57 | 55,LVAR+NN100x50,0.7304,4450,3229,1591,NN100x50,0.004,, 58 | 56,LVAR+NB,0.6719,3766,3229,1591,NB,0.005,, 59 | 57,LVAR+LR,0.7316,3766,3229,1591,LR,0.005,, 60 | 58,LVAR+5NN,0.6354,3766,3229,1591,5NN,0.005,, 61 | 59,LVAR+2NN,0.6499,3766,3229,1591,2NN,0.005,, 62 | 60,LVAR+1KNN,0.6537,3766,3229,1591,1KNN,0.005,, 63 | 61,LVAR+LSVM,0.719,3766,3229,1591,LSVM,0.005,, 64 | 62,LVAR+NN100x50,0.7291,3766,3229,1591,NN100x50,0.005,, 65 | 63,LVAR+NB,0.6713,2771,3229,1591,NB,0.01,, 66 | 64,LVAR+LR,0.7297,2771,3229,1591,LR,0.01,, 67 
| 65,LVAR+5NN,0.6493,2771,3229,1591,5NN,0.01,, 68 | 66,LVAR+2NN,0.6449,2771,3229,1591,2NN,0.01,, 69 | 67,LVAR+1KNN,0.6474,2771,3229,1591,1KNN,0.01,, 70 | 68,LVAR+LSVM,0.7134,2771,3229,1591,LSVM,0.01,, 71 | 69,LVAR+NN100x50,0.7285,2771,3229,1591,NN100x50,0.01,, 72 | 70,KBEST+NB,0.6235,1000,3229,1591,NB,,1000.0, 73 | 71,KBEST+LR,0.6776,1000,3229,1591,LR,,1000.0, 74 | 72,KBEST+5NN,0.616,1000,3229,1591,5NN,,1000.0, 75 | 73,KBEST+2NN,0.638,1000,3229,1591,2NN,,1000.0, 76 | 74,KBEST+1KNN,0.6273,1000,3229,1591,1KNN,,1000.0, 77 | 75,KBEST+LSVM,0.675,1000,3229,1591,LSVM,,1000.0, 78 | 76,KBEST+NN100x50,0.6725,1000,3229,1591,NN100x50,,1000.0, 79 | 77,KBEST+NB,0.6719,2000,3229,1591,NB,,2000.0, 80 | 78,KBEST+LR,0.709,2000,3229,1591,LR,,2000.0, 81 | 79,KBEST+5NN,0.6449,2000,3229,1591,5NN,,2000.0, 82 | 80,KBEST+2NN,0.6493,2000,3229,1591,2NN,,2000.0, 83 | 81,KBEST+1KNN,0.6474,2000,3229,1591,1KNN,,2000.0, 84 | 82,KBEST+LSVM,0.7002,2000,3229,1591,LSVM,,2000.0, 85 | 83,KBEST+NN100x50,0.7046,2000,3229,1591,NN100x50,,2000.0, 86 | 84,KBEST+NB,0.6625,3000,3229,1591,NB,,3000.0, 87 | 85,KBEST+LR,0.7184,3000,3229,1591,LR,,3000.0, 88 | 86,KBEST+5NN,0.6449,3000,3229,1591,5NN,,3000.0, 89 | 87,KBEST+2NN,0.648,3000,3229,1591,2NN,,3000.0, 90 | 88,KBEST+1KNN,0.6499,3000,3229,1591,1KNN,,3000.0, 91 | 89,KBEST+LSVM,0.7046,3000,3229,1591,LSVM,,3000.0, 92 | 90,KBEST+NN100x50,0.7165,3000,3229,1591,NN100x50,,3000.0, 93 | 91,KBEST+NB,0.6763,4000,3229,1591,NB,,4000.0, 94 | 92,KBEST+LR,0.7291,4000,3229,1591,LR,,4000.0, 95 | 93,KBEST+5NN,0.6644,4000,3229,1591,5NN,,4000.0, 96 | 94,KBEST+2NN,0.6562,4000,3229,1591,2NN,,4000.0, 97 | 95,KBEST+1KNN,0.6556,4000,3229,1591,1KNN,,4000.0, 98 | 96,KBEST+LSVM,0.7134,4000,3229,1591,LSVM,,4000.0, 99 | 97,KBEST+NN100x50,0.7278,4000,3229,1591,NN100x50,,4000.0, 100 | 98,KBEST+NB,0.682,5000,3229,1591,NB,,5000.0, 101 | 99,KBEST+LR,0.7304,5000,3229,1591,LR,,5000.0, 102 | 100,KBEST+5NN,0.6568,5000,3229,1591,5NN,,5000.0, 103 | 101,KBEST+2NN,0.6644,5000,3229,1591,2NN,,5000.0, 104 | 
102,KBEST+1KNN,0.6631,5000,3229,1591,1KNN,,5000.0, 105 | 103,KBEST+LSVM,0.7209,5000,3229,1591,LSVM,,5000.0, 106 | 104,KBEST+NN100x50,0.7234,5000,3229,1591,NN100x50,,5000.0, 107 | 105,KBEST+NB,0.6901,6000,3229,1591,NB,,6000.0, 108 | 106,KBEST+LR,0.7461,6000,3229,1591,LR,,6000.0, 109 | 107,KBEST+5NN,0.6518,6000,3229,1591,5NN,,6000.0, 110 | 108,KBEST+2NN,0.6606,6000,3229,1591,2NN,,6000.0, 111 | 109,KBEST+1KNN,0.6606,6000,3229,1591,1KNN,,6000.0, 112 | 110,KBEST+LSVM,0.7253,6000,3229,1591,LSVM,,6000.0, 113 | 111,KBEST+NN100x50,0.7341,6000,3229,1591,NN100x50,,6000.0, 114 | 112,KBEST+NB,0.6914,7000,3229,1591,NB,,7000.0, 115 | 113,KBEST+LR,0.7354,7000,3229,1591,LR,,7000.0, 116 | 114,KBEST+5NN,0.643,7000,3229,1591,5NN,,7000.0, 117 | 115,KBEST+2NN,0.6562,7000,3229,1591,2NN,,7000.0, 118 | 116,KBEST+1KNN,0.6562,7000,3229,1591,1KNN,,7000.0, 119 | 117,KBEST+LSVM,0.7146,7000,3229,1591,LSVM,,7000.0, 120 | 118,KBEST+NN100x50,0.704,7000,3229,1591,NN100x50,,7000.0, 121 | 119,KBEST+NB,0.6914,10000,3229,1591,NB,,10000.0, 122 | 120,KBEST+LR,0.7454,10000,3229,1591,LR,,10000.0, 123 | 121,KBEST+5NN,0.6455,10000,3229,1591,5NN,,10000.0, 124 | 122,KBEST+2NN,0.6662,10000,3229,1591,2NN,,10000.0, 125 | 123,KBEST+1KNN,0.6631,10000,3229,1591,1KNN,,10000.0, 126 | 124,KBEST+LSVM,0.7128,10000,3229,1591,LSVM,,10000.0, 127 | 125,KBEST+NN100x50,0.7423,10000,3229,1591,NN100x50,,10000.0, 128 | 126,KBEST+NB,0.6952,14000,3229,1591,NB,,14000.0, 129 | 127,KBEST+LR,0.7436,14000,3229,1591,LR,,14000.0, 130 | 128,KBEST+5NN,0.6449,14000,3229,1591,5NN,,14000.0, 131 | 129,KBEST+2NN,0.6675,14000,3229,1591,2NN,,14000.0, 132 | 130,KBEST+1KNN,0.6593,14000,3229,1591,1KNN,,14000.0, 133 | 131,KBEST+LSVM,0.7222,14000,3229,1591,LSVM,,14000.0, 134 | 132,KBEST+NN100x50,0.7335,14000,3229,1591,NN100x50,,14000.0, 135 | 133,TOPN+NB,0.687,905,3229,1591,NB,,,5.0 136 | 134,TOPN+LR,0.7335,905,3229,1591,LR,,,5.0 137 | 135,TOPN+5NN,0.682,905,3229,1591,5NN,,,5.0 138 | 136,TOPN+2NN,0.6675,905,3229,1591,2NN,,,5.0 139 | 
137,TOPN+1KNN,0.6744,905,3229,1591,1KNN,,,5.0 140 | 138,TOPN+LSVM,0.719,905,3229,1591,LSVM,,,5.0 141 | 139,TOPN+NN100x50,0.7278,905,3229,1591,NN100x50,,,5.0 142 | 140,TOPN+NB,0.6933,1533,3229,1591,NB,,,10.0 143 | 141,TOPN+LR,0.7392,1533,3229,1591,LR,,,10.0 144 | 142,TOPN+5NN,0.6662,1533,3229,1591,5NN,,,10.0 145 | 143,TOPN+2NN,0.6757,1533,3229,1591,2NN,,,10.0 146 | 144,TOPN+1KNN,0.675,1533,3229,1591,1KNN,,,10.0 147 | 145,TOPN+LSVM,0.7285,1533,3229,1591,LSVM,,,10.0 148 | 146,TOPN+NN100x50,0.7417,1533,3229,1591,NN100x50,,,10.0 149 | 147,TOPN+NB,0.6945,2064,3229,1591,NB,,,15.0 150 | 148,TOPN+LR,0.7467,2064,3229,1591,LR,,,15.0 151 | 149,TOPN+5NN,0.6574,2064,3229,1591,5NN,,,15.0 152 | 150,TOPN+2NN,0.6826,2064,3229,1591,2NN,,,15.0 153 | 151,TOPN+1KNN,0.682,2064,3229,1591,1KNN,,,15.0 154 | 152,TOPN+LSVM,0.7285,2064,3229,1591,LSVM,,,15.0 155 | 153,TOPN+NN100x50,0.7536,2064,3229,1591,NN100x50,,,15.0 156 | 154,TOPN+NB,0.692,2598,3229,1591,NB,,,20.0 157 | 155,TOPN+LR,0.7517,2598,3229,1591,LR,,,20.0 158 | 156,TOPN+5NN,0.6656,2598,3229,1591,5NN,,,20.0 159 | 157,TOPN+2NN,0.6794,2598,3229,1591,2NN,,,20.0 160 | 158,TOPN+1KNN,0.6807,2598,3229,1591,1KNN,,,20.0 161 | 159,TOPN+LSVM,0.7278,2598,3229,1591,LSVM,,,20.0 162 | 160,TOPN+NN100x50,0.7511,2598,3229,1591,NN100x50,,,20.0 163 | 161,TOPN+NB,0.6933,3045,3229,1591,NB,,,25.0 164 | 162,TOPN+LR,0.7473,3045,3229,1591,LR,,,25.0 165 | 163,TOPN+5NN,0.6549,3045,3229,1591,5NN,,,25.0 166 | 164,TOPN+2NN,0.6763,3045,3229,1591,2NN,,,25.0 167 | 165,TOPN+1KNN,0.6788,3045,3229,1591,1KNN,,,25.0 168 | 166,TOPN+LSVM,0.7348,3045,3229,1591,LSVM,,,25.0 169 | 167,TOPN+NN100x50,0.7234,3045,3229,1591,NN100x50,,,25.0 170 | 168,TOPN+NB,0.6958,4424,3229,1591,NB,,,50.0 171 | 169,TOPN+LR,0.7511,4424,3229,1591,LR,,,50.0 172 | 170,TOPN+5NN,0.6468,4424,3229,1591,5NN,,,50.0 173 | 171,TOPN+2NN,0.6694,4424,3229,1591,2NN,,,50.0 174 | 172,TOPN+1KNN,0.6782,4424,3229,1591,1KNN,,,50.0 175 | 173,TOPN+LSVM,0.7316,4424,3229,1591,LSVM,,,50.0 176 | 
174,TOPN+NN100x50,0.7436,4424,3229,1591,NN100x50,,,50.0 177 | 175,TOPN+NB,0.6914,5595,3229,1591,NB,,,100.0 178 | 176,TOPN+LR,0.7511,5595,3229,1591,LR,,,100.0 179 | 177,TOPN+5NN,0.643,5595,3229,1591,5NN,,,100.0 180 | 178,TOPN+2NN,0.6744,5595,3229,1591,2NN,,,100.0 181 | 179,TOPN+1KNN,0.6763,5595,3229,1591,1KNN,,,100.0 182 | 180,TOPN+LSVM,0.7272,5595,3229,1591,LSVM,,,100.0 183 | 181,TOPN+NN100x50,0.7473,5595,3229,1591,NN100x50,,,100.0 184 | 182,TOPN+NB,0.6958,6404,3229,1591,NB,,,250.0 185 | 183,TOPN+LR,0.7448,6404,3229,1591,LR,,,250.0 186 | 184,TOPN+5NN,0.6449,6404,3229,1591,5NN,,,250.0 187 | 185,TOPN+2NN,0.6725,6404,3229,1591,2NN,,,250.0 188 | 186,TOPN+1KNN,0.6738,6404,3229,1591,1KNN,,,250.0 189 | 187,TOPN+LSVM,0.7304,6404,3229,1591,LSVM,,,250.0 190 | 188,TOPN+NN100x50,0.7542,6404,3229,1591,NN100x50,,,250.0 191 | 189,TOPN+NB,0.6958,6654,3229,1591,NB,,,500.0 192 | 190,TOPN+LR,0.7436,6654,3229,1591,LR,,,500.0 193 | 191,TOPN+5NN,0.6417,6654,3229,1591,5NN,,,500.0 194 | 192,TOPN+2NN,0.6725,6654,3229,1591,2NN,,,500.0 195 | 193,TOPN+1KNN,0.6706,6654,3229,1591,1KNN,,,500.0 196 | 194,TOPN+LSVM,0.7291,6654,3229,1591,LSVM,,,500.0 197 | 195,TOPN+NN100x50,0.7486,6654,3229,1591,NN100x50,,,500.0 198 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /GraphOfDocs/evaluation_results/20newsgroups/20NEWSGROUPS_evaluation_results.csv: -------------------------------------------------------------------------------- 1 | ,Method,Accuracy,Number of features,Train size,Test size,Classifier,variance thershold,kbest,top_n 2 | 0,BOW+NB,0.9361,62384,7778,3832,NB,,, 3 | 1,BOW+LR,0.9387,62384,7778,3832,LR,,, 4 | 2,BOW+5NN,0.643,62384,7778,3832,5NN,,, 5 | 3,BOW+2NN,0.7557,62384,7778,3832,2NN,,, 6 | 4,BOW+1KNN,0.7597,62384,7778,3832,1KNN,,, 7 | 5,BOW+LSVM,0.9408,62384,7778,3832,LSVM,,, 8 | 6,BOW+NN100x50,0.9546,62384,7778,3832,NN100x50,,, 9 | 7,BOW+NN500x250,0.959,62384,7778,3832,NN500x250,,, 10 | 8,META+NB,0.9387,14907,7778,3832,NB,,, 11 | 9,META+LR,0.9376,14907,7778,3832,LR,,, 12 | 10,META+5NN,0.6542,14907,7778,3832,5NN,,, 13 | 11,META+2NN,0.7607,14907,7778,3832,2NN,,, 14 | 12,META+1KNN,0.7638,14907,7778,3832,1KNN,,, 15 | 13,META+LSVM,0.9408,14907,7778,3832,LSVM,,, 16 | 14,META+NN100x50,0.952,14907,7778,3832,NN100x50,,, 17 | 15,META+NN500x250,0.9562,14907,7778,3832,NN500x250,,, 18 | 
16,LVAR+NB,0.94,29992,7778,3832,NB,0.0005,, 19 | 17,LVAR+LR,0.9384,29992,7778,3832,LR,0.0005,, 20 | 18,LVAR+5NN,0.6341,29992,7778,3832,5NN,0.0005,, 21 | 19,LVAR+2NN,0.7526,29992,7778,3832,2NN,0.0005,, 22 | 20,LVAR+1KNN,0.7466,29992,7778,3832,1KNN,0.0005,, 23 | 21,LVAR+LSVM,0.9368,29992,7778,3832,LSVM,0.0005,, 24 | 22,LVAR+NN100x50,0.9541,29992,7778,3832,NN100x50,0.0005,, 25 | 23,LVAR+NN500x250,0.9609,29992,7778,3832,NN500x250,0.0005,, 26 | 24,LVAR+NB,0.9363,20410,7778,3832,NB,0.001,, 27 | 25,LVAR+LR,0.9358,20410,7778,3832,LR,0.001,, 28 | 26,LVAR+5NN,0.6292,20410,7778,3832,5NN,0.001,, 29 | 27,LVAR+2NN,0.7461,20410,7778,3832,2NN,0.001,, 30 | 28,LVAR+1KNN,0.744,20410,7778,3832,1KNN,0.001,, 31 | 29,LVAR+LSVM,0.9303,20410,7778,3832,LSVM,0.001,, 32 | 30,LVAR+NN100x50,0.9517,20410,7778,3832,NN100x50,0.001,, 33 | 31,LVAR+NN500x250,0.9549,20410,7778,3832,NN500x250,0.001,, 34 | 32,LVAR+NB,0.9306,15779,7778,3832,NB,0.0015,, 35 | 33,LVAR+LR,0.9316,15779,7778,3832,LR,0.0015,, 36 | 34,LVAR+5NN,0.6626,15779,7778,3832,5NN,0.0015,, 37 | 35,LVAR+2NN,0.7445,15779,7778,3832,2NN,0.0015,, 38 | 36,LVAR+1KNN,0.7456,15779,7778,3832,1KNN,0.0015,, 39 | 37,LVAR+LSVM,0.929,15779,7778,3832,LSVM,0.0015,, 40 | 38,LVAR+NN100x50,0.9502,15779,7778,3832,NN100x50,0.0015,, 41 | 39,LVAR+NN500x250,0.9538,15779,7778,3832,NN500x250,0.0015,, 42 | 40,LVAR+NB,0.9282,13521,7778,3832,NB,0.002,, 43 | 41,LVAR+LR,0.9308,13521,7778,3832,LR,0.002,, 44 | 42,LVAR+5NN,0.6827,13521,7778,3832,5NN,0.002,, 45 | 43,LVAR+2NN,0.7435,13521,7778,3832,2NN,0.002,, 46 | 44,LVAR+1KNN,0.7456,13521,7778,3832,1KNN,0.002,, 47 | 45,LVAR+LSVM,0.9251,13521,7778,3832,LSVM,0.002,, 48 | 46,LVAR+NN100x50,0.9434,13521,7778,3832,NN100x50,0.002,, 49 | 47,LVAR+NN500x250,0.9512,13521,7778,3832,NN500x250,0.002,, 50 | 48,LVAR+NB,0.9212,10469,7778,3832,NB,0.003,, 51 | 49,LVAR+LR,0.9277,10469,7778,3832,LR,0.003,, 52 | 50,LVAR+5NN,0.6699,10469,7778,3832,5NN,0.003,, 53 | 51,LVAR+2NN,0.7469,10469,7778,3832,2NN,0.003,, 54 | 
52,LVAR+1KNN,0.7479,10469,7778,3832,1KNN,0.003,, 55 | 53,LVAR+LSVM,0.9204,10469,7778,3832,LSVM,0.003,, 56 | 54,LVAR+NN100x50,0.9442,10469,7778,3832,NN100x50,0.003,, 57 | 55,LVAR+NN500x250,0.9489,10469,7778,3832,NN500x250,0.003,, 58 | 56,LVAR+NB,0.9152,8787,7778,3832,NB,0.004,, 59 | 57,LVAR+LR,0.9217,8787,7778,3832,LR,0.004,, 60 | 58,LVAR+5NN,0.6508,8787,7778,3832,5NN,0.004,, 61 | 59,LVAR+2NN,0.7427,8787,7778,3832,2NN,0.004,, 62 | 60,LVAR+1KNN,0.7477,8787,7778,3832,1KNN,0.004,, 63 | 61,LVAR+LSVM,0.9113,8787,7778,3832,LSVM,0.004,, 64 | 62,LVAR+NN100x50,0.9241,8787,7778,3832,NN100x50,0.004,, 65 | 63,LVAR+NN500x250,0.9431,8787,7778,3832,NN500x250,0.004,, 66 | 64,LVAR+NB,0.9108,7651,7778,3832,NB,0.005,, 67 | 65,LVAR+LR,0.9186,7651,7778,3832,LR,0.005,, 68 | 66,LVAR+5NN,0.6647,7651,7778,3832,5NN,0.005,, 69 | 67,LVAR+2NN,0.7445,7651,7778,3832,2NN,0.005,, 70 | 68,LVAR+1KNN,0.7461,7651,7778,3832,1KNN,0.005,, 71 | 69,LVAR+LSVM,0.9094,7651,7778,3832,LSVM,0.005,, 72 | 70,LVAR+NN100x50,0.934,7651,7778,3832,NN100x50,0.005,, 73 | 71,LVAR+NN500x250,0.9415,7651,7778,3832,NN500x250,0.005,, 74 | 72,LVAR+NB,0.8977,4880,7778,3832,NB,0.01,, 75 | 73,LVAR+LR,0.9076,4880,7778,3832,LR,0.01,, 76 | 74,LVAR+5NN,0.6942,4880,7778,3832,5NN,0.01,, 77 | 75,LVAR+2NN,0.7484,4880,7778,3832,2NN,0.01,, 78 | 76,LVAR+1KNN,0.7474,4880,7778,3832,1KNN,0.01,, 79 | 77,LVAR+LSVM,0.8883,4880,7778,3832,LSVM,0.01,, 80 | 78,LVAR+NN100x50,0.916,4880,7778,3832,NN100x50,0.01,, 81 | 79,LVAR+NN500x250,0.9233,4880,7778,3832,NN500x250,0.01,, 82 | 80,KBEST+NB,0.8142,1000,7778,3832,NB,,1000.0, 83 | 81,KBEST+LR,0.8411,1000,7778,3832,LR,,1000.0, 84 | 82,KBEST+5NN,0.7109,1000,7778,3832,5NN,,1000.0, 85 | 83,KBEST+2NN,0.7401,1000,7778,3832,2NN,,1000.0, 86 | 84,KBEST+1KNN,0.738,1000,7778,3832,1KNN,,1000.0, 87 | 85,KBEST+LSVM,0.8419,1000,7778,3832,LSVM,,1000.0, 88 | 86,KBEST+NN100x50,0.8398,1000,7778,3832,NN100x50,,1000.0, 89 | 87,KBEST+NN500x250,0.858,1000,7778,3832,NN500x250,,1000.0, 90 | 
88,KBEST+NB,0.852,2000,7778,3832,NB,,2000.0, 91 | 89,KBEST+LR,0.8805,2000,7778,3832,LR,,2000.0, 92 | 90,KBEST+5NN,0.7049,2000,7778,3832,5NN,,2000.0, 93 | 91,KBEST+2NN,0.7557,2000,7778,3832,2NN,,2000.0, 94 | 92,KBEST+1KNN,0.7576,2000,7778,3832,1KNN,,2000.0, 95 | 93,KBEST+LSVM,0.8638,2000,7778,3832,LSVM,,2000.0, 96 | 94,KBEST+NN100x50,0.8779,2000,7778,3832,NN100x50,,2000.0, 97 | 95,KBEST+NN500x250,0.8847,2000,7778,3832,NN500x250,,2000.0, 98 | 96,KBEST+NB,0.8706,3000,7778,3832,NB,,3000.0, 99 | 97,KBEST+LR,0.8935,3000,7778,3832,LR,,3000.0, 100 | 98,KBEST+5NN,0.6965,3000,7778,3832,5NN,,3000.0, 101 | 99,KBEST+2NN,0.757,3000,7778,3832,2NN,,3000.0, 102 | 100,KBEST+1KNN,0.7615,3000,7778,3832,1KNN,,3000.0, 103 | 101,KBEST+LSVM,0.8758,3000,7778,3832,LSVM,,3000.0, 104 | 102,KBEST+NN100x50,0.8948,3000,7778,3832,NN100x50,,3000.0, 105 | 103,KBEST+NN500x250,0.9014,3000,7778,3832,NN500x250,,3000.0, 106 | 104,KBEST+NB,0.9001,5000,7778,3832,NB,,5000.0, 107 | 105,KBEST+LR,0.9084,5000,7778,3832,LR,,5000.0, 108 | 106,KBEST+5NN,0.721,5000,7778,3832,5NN,,5000.0, 109 | 107,KBEST+2NN,0.7565,5000,7778,3832,2NN,,5000.0, 110 | 108,KBEST+1KNN,0.7557,5000,7778,3832,1KNN,,5000.0, 111 | 109,KBEST+LSVM,0.8922,5000,7778,3832,LSVM,,5000.0, 112 | 110,KBEST+NN100x50,0.9154,5000,7778,3832,NN100x50,,5000.0, 113 | 111,KBEST+NN500x250,0.9272,5000,7778,3832,NN500x250,,5000.0, 114 | 112,KBEST+NB,0.9194,10000,7778,3832,NB,,10000.0, 115 | 113,KBEST+LR,0.9251,10000,7778,3832,LR,,10000.0, 116 | 114,KBEST+5NN,0.6803,10000,7778,3832,5NN,,10000.0, 117 | 115,KBEST+2NN,0.7516,10000,7778,3832,2NN,,10000.0, 118 | 116,KBEST+1KNN,0.7537,10000,7778,3832,1KNN,,10000.0, 119 | 117,KBEST+LSVM,0.9165,10000,7778,3832,LSVM,,10000.0, 120 | 118,KBEST+NN100x50,0.9405,10000,7778,3832,NN100x50,,10000.0, 121 | 119,KBEST+NN500x250,0.9444,10000,7778,3832,NN500x250,,10000.0, 122 | 120,KBEST+NB,0.9293,15000,7778,3832,NB,,15000.0, 123 | 121,KBEST+LR,0.9327,15000,7778,3832,LR,,15000.0, 124 | 122,KBEST+5NN,0.696,15000,7778,3832,5NN,,15000.0, 
125 | 123,KBEST+2NN,0.7505,15000,7778,3832,2NN,,15000.0, 126 | 124,KBEST+1KNN,0.7466,15000,7778,3832,1KNN,,15000.0, 127 | 125,KBEST+LSVM,0.9241,15000,7778,3832,LSVM,,15000.0, 128 | 126,KBEST+NN100x50,0.9442,15000,7778,3832,NN100x50,,15000.0, 129 | 127,KBEST+NN500x250,0.9517,15000,7778,3832,NN500x250,,15000.0, 130 | 128,KBEST+NB,0.9332,20000,7778,3832,NB,,20000.0, 131 | 129,KBEST+LR,0.9353,20000,7778,3832,LR,,20000.0, 132 | 130,KBEST+5NN,0.7129,20000,7778,3832,5NN,,20000.0, 133 | 131,KBEST+2NN,0.7544,20000,7778,3832,2NN,,20000.0, 134 | 132,KBEST+1KNN,0.7537,20000,7778,3832,1KNN,,20000.0, 135 | 133,KBEST+LSVM,0.9327,20000,7778,3832,LSVM,,20000.0, 136 | 134,KBEST+NN100x50,0.9502,20000,7778,3832,NN100x50,,20000.0, 137 | 135,KBEST+NN500x250,0.9528,20000,7778,3832,NN500x250,,20000.0, 138 | 136,KBEST+NB,0.9374,25000,7778,3832,NB,,25000.0, 139 | 137,KBEST+LR,0.9384,25000,7778,3832,LR,,25000.0, 140 | 138,KBEST+5NN,0.697,25000,7778,3832,5NN,,25000.0, 141 | 139,KBEST+2NN,0.7544,25000,7778,3832,2NN,,25000.0, 142 | 140,KBEST+1KNN,0.7516,25000,7778,3832,1KNN,,25000.0, 143 | 141,KBEST+LSVM,0.9348,25000,7778,3832,LSVM,,25000.0, 144 | 142,KBEST+NN100x50,0.9556,25000,7778,3832,NN100x50,,25000.0, 145 | 143,KBEST+NN500x250,0.9538,25000,7778,3832,NN500x250,,25000.0, 146 | 144,KBEST+NB,0.9382,30000,7778,3832,NB,,30000.0, 147 | 145,KBEST+LR,0.9389,30000,7778,3832,LR,,30000.0, 148 | 146,KBEST+5NN,0.6464,30000,7778,3832,5NN,,30000.0, 149 | 147,KBEST+2NN,0.7521,30000,7778,3832,2NN,,30000.0, 150 | 148,KBEST+1KNN,0.7458,30000,7778,3832,1KNN,,30000.0, 151 | 149,KBEST+LSVM,0.9366,30000,7778,3832,LSVM,,30000.0, 152 | 150,KBEST+NN100x50,0.9564,30000,7778,3832,NN100x50,,30000.0, 153 | 151,KBEST+NN500x250,0.958,30000,7778,3832,NN500x250,,30000.0, 154 | 152,TOPN+NB,0.7996,982,7778,3832,NB,,,5.0 155 | 153,TOPN+LR,0.8233,982,7778,3832,LR,,,5.0 156 | 154,TOPN+5NN,0.6644,982,7778,3832,5NN,,,5.0 157 | 155,TOPN+2NN,0.7174,982,7778,3832,2NN,,,5.0 158 | 156,TOPN+1KNN,0.7231,982,7778,3832,1KNN,,,5.0 159 | 
157,TOPN+LSVM,0.7871,982,7778,3832,LSVM,,,5.0 160 | 158,TOPN+NN100x50,0.8233,982,7778,3832,NN100x50,,,5.0 161 | 159,TOPN+NN500x250,0.8463,982,7778,3832,NN500x250,,,5.0 162 | 160,TOPN+NB,0.8854,2926,7778,3832,NB,,,10.0 163 | 161,TOPN+LR,0.9006,2926,7778,3832,LR,,,10.0 164 | 162,TOPN+5NN,0.7169,2926,7778,3832,5NN,,,10.0 165 | 163,TOPN+2NN,0.7667,2926,7778,3832,2NN,,,10.0 166 | 164,TOPN+1KNN,0.7735,2926,7778,3832,1KNN,,,10.0 167 | 165,TOPN+LSVM,0.8802,2926,7778,3832,LSVM,,,10.0 168 | 166,TOPN+NN100x50,0.9134,2926,7778,3832,NN100x50,,,10.0 169 | 167,TOPN+NN500x250,0.9196,2926,7778,3832,NN500x250,,,10.0 170 | 168,TOPN+NB,0.9058,5096,7778,3832,NB,,,15.0 171 | 169,TOPN+LR,0.917,5096,7778,3832,LR,,,15.0 172 | 170,TOPN+5NN,0.6986,5096,7778,3832,5NN,,,15.0 173 | 171,TOPN+2NN,0.7654,5096,7778,3832,2NN,,,15.0 174 | 172,TOPN+1KNN,0.7706,5096,7778,3832,1KNN,,,15.0 175 | 173,TOPN+LSVM,0.9045,5096,7778,3832,LSVM,,,15.0 176 | 174,TOPN+NN100x50,0.9314,5096,7778,3832,NN100x50,,,15.0 177 | 175,TOPN+NN500x250,0.9337,5096,7778,3832,NN500x250,,,15.0 178 | 176,TOPN+NB,0.9196,7548,7778,3832,NB,,,20.0 179 | 177,TOPN+LR,0.9269,7548,7778,3832,LR,,,20.0 180 | 178,TOPN+5NN,0.7059,7548,7778,3832,5NN,,,20.0 181 | 179,TOPN+2NN,0.7664,7548,7778,3832,2NN,,,20.0 182 | 180,TOPN+1KNN,0.7704,7548,7778,3832,1KNN,,,20.0 183 | 181,TOPN+LSVM,0.917,7548,7778,3832,LSVM,,,20.0 184 | 182,TOPN+NN100x50,0.9434,7548,7778,3832,NN100x50,,,20.0 185 | 183,TOPN+NN500x250,0.9468,7548,7778,3832,NN500x250,,,20.0 186 | 184,TOPN+NB,0.9269,10149,7778,3832,NB,,,25.0 187 | 185,TOPN+LR,0.9311,10149,7778,3832,LR,,,25.0 188 | 186,TOPN+5NN,0.7171,10149,7778,3832,5NN,,,25.0 189 | 187,TOPN+2NN,0.7701,10149,7778,3832,2NN,,,25.0 190 | 188,TOPN+1KNN,0.7756,10149,7778,3832,1KNN,,,25.0 191 | 189,TOPN+LSVM,0.9277,10149,7778,3832,LSVM,,,25.0 192 | 190,TOPN+NN100x50,0.947,10149,7778,3832,NN100x50,,,25.0 193 | 191,TOPN+NN500x250,0.9528,10149,7778,3832,NN500x250,,,25.0 194 | 192,TOPN+NB,0.9421,20942,7778,3832,NB,,,50.0 195 | 
193,TOPN+LR,0.9384,20942,7778,3832,LR,,,50.0 196 | 194,TOPN+5NN,0.7192,20942,7778,3832,5NN,,,50.0 197 | 195,TOPN+2NN,0.7644,20942,7778,3832,2NN,,,50.0 198 | 196,TOPN+1KNN,0.7717,20942,7778,3832,1KNN,,,50.0 199 | 197,TOPN+LSVM,0.9379,20942,7778,3832,LSVM,,,50.0 200 | 198,TOPN+NN100x50,0.9554,20942,7778,3832,NN100x50,,,50.0 201 | 199,TOPN+NN500x250,0.9509,20942,7778,3832,NN500x250,,,50.0 202 | 200,TOPN+NB,0.9402,30793,7778,3832,NB,,,100.0 203 | 201,TOPN+LR,0.9402,30793,7778,3832,LR,,,100.0 204 | 202,TOPN+5NN,0.6934,30793,7778,3832,5NN,,,100.0 205 | 203,TOPN+2NN,0.7568,30793,7778,3832,2NN,,,100.0 206 | 204,TOPN+1KNN,0.7649,30793,7778,3832,1KNN,,,100.0 207 | 205,TOPN+LSVM,0.9384,30793,7778,3832,LSVM,,,100.0 208 | 206,TOPN+NN100x50,0.9575,30793,7778,3832,NN100x50,,,100.0 209 | 207,TOPN+NN500x250,0.9603,30793,7778,3832,NN500x250,,,100.0 210 | 208,TOPN+NB,0.9415,37281,7778,3832,NB,,,250.0 211 | 209,TOPN+LR,0.9402,37281,7778,3832,LR,,,250.0 212 | 210,TOPN+5NN,0.6908,37281,7778,3832,5NN,,,250.0 213 | 211,TOPN+2NN,0.755,37281,7778,3832,2NN,,,250.0 214 | 212,TOPN+1KNN,0.763,37281,7778,3832,1KNN,,,250.0 215 | 213,TOPN+LSVM,0.9392,37281,7778,3832,LSVM,,,250.0 216 | 214,TOPN+NN100x50,0.9551,37281,7778,3832,NN100x50,,,250.0 217 | 215,TOPN+NN500x250,0.9616,37281,7778,3832,NN500x250,,,250.0 218 | 216,TOPN+NB,0.9397,39694,7778,3832,NB,,,500.0 219 | 217,TOPN+LR,0.94,39694,7778,3832,LR,,,500.0 220 | 218,TOPN+5NN,0.6895,39694,7778,3832,5NN,,,500.0 221 | 219,TOPN+2NN,0.7547,39694,7778,3832,2NN,,,500.0 222 | 220,TOPN+1KNN,0.7625,39694,7778,3832,1KNN,,,500.0 223 | 221,TOPN+LSVM,0.9392,39694,7778,3832,LSVM,,,500.0 224 | 222,TOPN+NN100x50,0.9572,39694,7778,3832,NN100x50,,,500.0 225 | 223,TOPN+NN500x250,0.9577,39694,7778,3832,NN500x250,,,500.0 226 | -------------------------------------------------------------------------------- /GraphOfDocs/evaluation.py: -------------------------------------------------------------------------------- 1 | from sklearn.model_selection import train_test_split 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from collections import Counter
from GraphOfDocs import select
from GraphOfDocs import config_experiments
import os
import seaborn as sns
import matplotlib.pyplot as plt


def benchmark_classifier(clf, X_train, y_train, X_test, y_test, round_digits = 4):
    """Fit ``clf`` on the training data and score it on the test data.

    Args:
        clf: object exposing ``fit(X, y)`` and ``predict(X)``.
        X_train, y_train: training samples and labels passed to ``fit``.
        X_test, y_test: test samples passed to ``predict`` and the labels
            the predictions are compared against.
        round_digits: number of decimals the accuracy is rounded to.

    Returns:
        Tuple ``(clf, accuracy)``: the fitted classifier and the rounded
        value of ``accuracy_score(y_test, predictions)``.
    """
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = round(accuracy_score(y_test, y_pred), round_digits)
    return clf, accuracy


def _draw_accuracy_lineplot(data):
    """Draw accuracy against number of features, one line per 'Method'."""
    return sns.lineplot(
        x = "Number of features",
        y = "Accuracy",
        hue = "Method",
        style = "Method",
        markers = True,
        dashes = False,
        data = data
    )


def generate_plots(df, show_only = True, output_dir = '', plots_prefix = 'plot'):
    """Plot accuracy vs. number of features, one figure per classifier.

    Args:
        df: DataFrame with at least the columns 'Classifier', 'Method',
            'Number of features' and 'Accuracy'.
        show_only: when True each figure is displayed with ``plt.show()``
            and nothing is written to disk.
        output_dir: directory the images are written to when ``show_only``
            is False. An empty string means the current working directory.
        plots_prefix: file name prefix of the saved images.

    When saving, two images are written per classifier:
    ``<prefix>_<classifier>.png`` with automatic y limits and
    ``<prefix>_<classifier>_0_1.png`` with the y axis fixed to [0, 1].
    """
    for classifier_name in df['Classifier'].unique():
        df_tmp = df[df['Classifier'] == classifier_name]

        _draw_accuracy_lineplot(df_tmp)
        if show_only:
            plt.show()
            continue

        # os.path.join keeps the default output_dir = '' relative to the
        # working directory instead of producing a path rooted at '/'.
        base_name = f'{plots_prefix}_{classifier_name}'
        plt.savefig(os.path.join(output_dir, f'{base_name}.png'), dpi = 100)
        plt.clf()

        # Same data again, this time on a fixed [0, 1] accuracy axis.
        _draw_accuracy_lineplot(df_tmp)
        plt.ylim(0, 1)
        plt.savefig(os.path.join(output_dir, f'{base_name}_0_1.png'), dpi = 100)
        plt.clf()


class GraphOfDocsClassifier:
    """Classifies a document by the majority class of its community."""

    def __init__(self, doc_to_community_dict, doc_communities_dict,
                 test_size = 0.33, random_state = 42):
        """
        Args:
            doc_to_community_dict: maps a document identifier to its
                community id.
            doc_communities_dict: maps a community id to the document
                identifiers that belong to it.
            test_size, random_state: forwarded to ``train_test_split``.
        """
        self.__test_size = test_size
        self.__random_state = random_state
        self.__doc_to_community_dict = doc_to_community_dict
        self.__doc_communities_dict = doc_communities_dict

    def calculate_accuracy(self, document_identifiers, results_table):
        """Score the community-majority classifier on the test split.

        Each test document is assigned the most common class among the
        other documents of its community. The resulting accuracy, rounded
        to 4 decimals, is appended to ``results_table`` via ``add_row``.

        Raises:
            KeyError: if a test document or its community id is missing
                from the dictionaries given to the constructor.
            IndexError: if a test document is the only member of its
                community, because there is then no class to vote for.
        """
        _, test_docs = train_test_split(document_identifiers,
                                        test_size = self.__test_size,
                                        random_state = self.__random_state)
        test_docs = list(test_docs)
        class_true = []
        class_pred = []
        for test_doc in test_docs:
            community_id = self.__doc_to_community_dict[test_doc]
            community_docs = self.__doc_communities_dict[community_id]
            # The document being classified must not vote for itself.
            classes = [config_experiments.extract_file_class(doc)
                       for doc in community_docs if doc != test_doc]

            class_true.append(config_experiments.extract_file_class(test_doc))
            class_pred.append(Counter(classes).most_common(1)[0][0])
        accuracy = round(accuracy_score(class_true, class_pred), 4)
        results_table.add_row(['Graph-of-docs Classifier',
                               accuracy, 'N/A', 'N/A', len(test_docs), ''])


class Evaluator:
    """Base class of the feature-extraction / feature-selection evaluators.

    Subclasses implement ``evaluate``; the helpers below hold the steps
    every subclass shares.
    """

    def __init__(self, test_size = 0.33, random_state = 42):
        self._test_size = test_size
        self._random_state = random_state

    def evaluate(self, x, y, **kwargs):
        """Run the evaluation; must be overridden by subclasses."""
        # NotImplemented is a comparison sentinel, not an exception, and
        # calling it raises TypeError. NotImplementedError is the intended
        # signal for an abstract method.
        raise NotImplementedError('pure virtual')

    def _split_train_test(self, x, y):
        """Split ``x`` and ``y`` with this evaluator's size and seed.

        Returns:
            ``(x_train, x_test, y_train, y_test)`` as produced by
            ``train_test_split``.
        """
        return train_test_split(x, y,
                                test_size = self._test_size,
                                random_state = self._random_state)

    def _fit_transform_features(self, vectorizer, x_train, x_test,
                                y_train = None, selector = None):
        """Vectorize both splits and optionally apply a feature selector.

        The vectorizer, and the selector when given, are fitted on the
        training split only and then applied to the test split.

        Returns:
            ``(x_train_transformed, x_test_transformed)``.
        """
        x_train_transformed = vectorizer.fit_transform(x_train)
        x_test_transformed = vectorizer.transform(x_test)
        if selector is not None:
            x_train_transformed = selector.fit_transform(x_train_transformed, y_train)
            x_test_transformed = selector.transform(x_test_transformed)
        return x_train_transformed, x_test_transformed

    def _collect_evaluation_results(self, x_train_transformed, y_train,
                                    x_test_transformed, y_test, results_table,
                                    classifiers, method_prefix, extra_details = None):
        """Benchmark every classifier on the transformed data.

        Args:
            x_train_transformed, x_test_transformed: feature matrices with
                a ``shape`` attribute (rows = samples, columns = features).
            y_train, y_test: labels of the two splits.
            results_table: object exposing ``add_row(list)``; one row is
                appended per classifier.
            classifiers: iterable of ``(name, estimator)`` pairs.
            method_prefix: string prepended to each classifier name to
                form the 'Method' value.
            extra_details: optional dict of method parameters; it is shown
                in the table row and merged into each result dict.

        Returns:
            List with one dict per classifier holding 'Method', 'Accuracy',
            'Number of features', 'Train size', 'Test size', the entries
            of ``extra_details`` and 'Classifier'.
        """
        # A None default avoids sharing one mutable dict between calls.
        # Falling back to {} keeps the table cell rendered as '{}'.
        if extra_details is None:
            extra_details = {}
        train_size = x_train_transformed.shape[0]
        test_size = x_test_transformed.shape[0]
        number_of_features = x_test_transformed.shape[1]
        evaluation_results = []
        for classifier_name, estimator in classifiers:
            _, accuracy = benchmark_classifier(
                estimator, x_train_transformed, y_train,
                x_test_transformed, y_test)
            method = method_prefix + classifier_name
            results_table.add_row([method, accuracy,
                                   number_of_features,
                                   train_size, test_size,
                                   str(extra_details)])
            classifier_results = {
                'Method': method,
                'Accuracy': accuracy,
                'Number of features': number_of_features,
                'Train size': train_size,
                'Test size': test_size,
            }
            classifier_results.update(extra_details)
            classifier_results.update({'Classifier': classifier_name})
            evaluation_results.append(classifier_results)
        return evaluation_results


class BOWEvaluator(Evaluator):
    """Bag-of-words baseline: unigram counts, no feature selection."""

    def __init__(self, test_size = 0.33, random_state = 42):
        super().__init__(test_size, random_state)

    def evaluate(self, x, y, **kwargs):
        """Evaluate ``kwargs['classifiers']`` on plain unigram counts.

        Requires the keyword arguments 'results_table' and 'classifiers'.
        """
        x_train, x_test, y_train, y_test = self._split_train_test(x, y)
        x_train_transformed, x_test_transformed = self._fit_transform_features(
            CountVectorizer(), x_train, x_test)
        print(f'Number of features in BOWEvaluator: {x_train_transformed.shape[1]}')

        results_table = kwargs['results_table']
        classifiers = kwargs['classifiers']
        return self._collect_evaluation_results(
            x_train_transformed, y_train, x_test_transformed,
            y_test, results_table, classifiers, method_prefix = 'BOW+')


class MetaFeatureSelectionEvaluator(Evaluator):
    """Unigram counts filtered with ``SelectFromModel``."""

    def __init__(self, estimator_model = LinearSVC,
                 test_size = 0.33, random_state = 42):
        """
        Args:
            estimator_model: zero-argument callable returning the estimator
                handed to ``SelectFromModel`` (a class, not an instance).
        """
        super().__init__(test_size, random_state)
        self.__estimator_model = estimator_model

    def evaluate(self, x, y, **kwargs):
        """Evaluate on the features kept by ``SelectFromModel``.

        Requires the keyword arguments 'results_table' and 'classifiers'.
        """
        x_train, x_test, y_train, y_test = self._split_train_test(x, y)
        selector = SelectFromModel(estimator = self.__estimator_model())
        x_train_transformed, x_test_transformed = self._fit_transform_features(
            CountVectorizer(), x_train, x_test, y_train, selector)

        results_table = kwargs['results_table']
        classifiers = kwargs['classifiers']
        return self._collect_evaluation_results(
            x_train_transformed, y_train, x_test_transformed,
            y_test, results_table, classifiers, method_prefix = 'META+')


class LowVarianceFeatureSelectionEvaluator(Evaluator):
    """Unigram counts filtered with ``VarianceThreshold``."""

    def __init__(self, variance_threshold, test_size = 0.33, random_state = 42):
        super().__init__(test_size, random_state)
        self.__variance_threshold = variance_threshold

    def evaluate(self, x, y, **kwargs):
        """Evaluate on the features that pass the variance threshold.

        Requires the keyword arguments 'results_table' and 'classifiers'.
        """
        x_train, x_test, y_train, y_test = self._split_train_test(x, y)
        selector = VarianceThreshold(threshold = self.__variance_threshold)
        x_train_transformed, x_test_transformed = self._fit_transform_features(
            CountVectorizer(), x_train, x_test, y_train, selector)

        results_table = kwargs['results_table']
        classifiers = kwargs['classifiers']
        # The misspelled key is kept on purpose: it is emitted as a key of
        # every result dict, so renaming it would change the output.
        extra_details = {
            'variance thershold': self.__variance_threshold
        }
        return self._collect_evaluation_results(
            x_train_transformed, y_train, x_test_transformed,
            y_test, results_table, classifiers, method_prefix = 'LVAR+',
            extra_details = extra_details)


class SelectKBestFeatureSelectionEvaluator(Evaluator):
    """Unigram counts reduced to the ``kbest`` features ranked by chi2."""

    def __init__(self, kbest, test_size = 0.33, random_state = 42):
        super().__init__(test_size, random_state)
        self.__kbest = kbest

    def evaluate(self, x, y, **kwargs):
        """Evaluate on the ``kbest`` unigram features selected by chi2.

        Requires the keyword arguments 'results_table' and 'classifiers'.
        """
        x_train, x_test, y_train, y_test = self._split_train_test(x, y)
        selector = SelectKBest(chi2, k = self.__kbest)
        x_train_transformed, x_test_transformed = self._fit_transform_features(
            CountVectorizer(), x_train, x_test, y_train, selector)

        results_table = kwargs['results_table']
        classifiers = kwargs['classifiers']
        extra_details = {
            'kbest': self.__kbest
        }
        return self._collect_evaluation_results(
            x_train_transformed, y_train, x_test_transformed,
            y_test, results_table, classifiers, method_prefix = 'KBEST+',
            extra_details = extra_details)


class BigramsExtractionEvaluator(Evaluator):
    """Bigram counts (``ngram_range = (2, 2)``), no feature selection."""

    def __init__(self, test_size = 0.33, random_state = 42):
        super().__init__(test_size, random_state)

    def evaluate(self, x, y, **kwargs):
        """Evaluate ``kwargs['classifiers']`` on bigram counts.

        Requires the keyword arguments 'results_table' and 'classifiers'.
        """
        x_train, x_test, y_train, y_test = self._split_train_test(x, y)
        x_train_transformed, x_test_transformed = self._fit_transform_features(
            CountVectorizer(ngram_range = (2, 2)), x_train, x_test)

        results_table = kwargs['results_table']
        classifiers = kwargs['classifiers']
        return self._collect_evaluation_results(
            x_train_transformed, y_train, x_test_transformed,
            y_test, results_table, classifiers, 'BI+')


class BigramsExtractionAndSelectKBestFeatureSelectionEvaluator(Evaluator):
    """Bigram counts reduced to the ``kbest`` features ranked by chi2."""

    def __init__(self, kbest, test_size = 0.33, random_state = 42):
        super().__init__(test_size, random_state)
        self.__kbest = kbest

    def evaluate(self, x, y, **kwargs):
        """Evaluate on the ``kbest`` bigram features selected by chi2.

        Requires the keyword arguments 'results_table' and 'classifiers'.
        """
        x_train, x_test, y_train, y_test = self._split_train_test(x, y)
        selector = SelectKBest(chi2, k = self.__kbest)
        x_train_transformed, x_test_transformed = self._fit_transform_features(
            CountVectorizer(ngram_range = (2, 2)), x_train, x_test,
            y_train, selector)

        results_table = kwargs['results_table']
        classifiers = kwargs['classifiers']
        extra_details = {
            'kbest': self.__kbest
        }
        return self._collect_evaluation_results(
            x_train_transformed, y_train, x_test_transformed, y_test,
            results_table, classifiers, 'BI+KBEST+', extra_details = extra_details)


class TopNOfEachCommunityEvaluator(Evaluator):
    """Unigram counts restricted to the top-N tags of the communities."""

    def __init__(self, top_n, doc_to_community_dict,
                 doc_communities_dict, test_size = 0.33, random_state = 42):
        super().__init__(test_size, random_state)
        self.__top_n = top_n
        self.__doc_to_community_dict = doc_to_community_dict
        self.__doc_communities_dict = doc_communities_dict

    def evaluate(self, x, y, **kwargs):
        """Evaluate on a vocabulary built from community tags.

        The vocabulary is the union of the tags returned by
        ``select.get_communities_tags`` for the community of every training
        document.

        Requires the keyword arguments 'df' (DataFrame with an 'identifier'
        column), 'positions_train' (row positions of the training
        documents in ``df``), 'database', 'results_table' and 'classifiers'.
        """
        df = kwargs['df']
        positions_train = kwargs['positions_train']
        train_docs = list(df.iloc[positions_train]['identifier'])
        database = kwargs['database']
        community_id_to_tags = select.get_communities_tags(
            database, top_terms = self.__top_n)
        # Sorting fixes the feature column order across runs, because plain
        # set iteration order of strings is not stable between interpreter
        # processes. Assumes the tags are strings - TODO confirm against
        # select.get_communities_tags.
        vocabulary = sorted({
            word
            for doc in train_docs
            for word in community_id_to_tags[self.__doc_to_community_dict[doc]]
        })
        x_train, x_test, y_train, y_test = self._split_train_test(x, y)
        x_train_transformed, x_test_transformed = self._fit_transform_features(
            CountVectorizer(vocabulary = vocabulary), x_train, x_test)

        results_table = kwargs['results_table']
        classifiers = kwargs['classifiers']
        extra_details = {
            'top_n': self.__top_n
        }
        return self._collect_evaluation_results(
            x_train_transformed, y_train, x_test_transformed,
            y_test, results_table, classifiers, 'TOPN+',
            extra_details = extra_details)


# Ignore this class for the AIAI paper. Future work.
class Docs2ComEvaluator(Evaluator):
    """Trains on one sample per community and tests on single documents."""

    def __init__(self, top_n, doc_to_community_dict,
                 doc_communities_dict, label_encoder, test_size = 0.33, random_state = 42):
        """
        Args:
            label_encoder: object whose ``transform`` maps the class labels
                of the communities to the encoding used in the
                'class_number' column of the DataFrame.
        """
        super().__init__(test_size, random_state)
        self.__top_n = top_n
        self.__doc_to_community_dict = doc_to_community_dict
        self.__doc_communities_dict = doc_communities_dict
        self.__label_encoder = label_encoder

    # ### [tag1, tag2, ... tagN] -> class (Do this for each community of docs)
    # TODO: Clean up this method.
    def evaluate(self, x, y, **kwargs):
        """Evaluate classifiers trained on community tags.

        Every community that contains a training document becomes one
        training sample: its text is the space-joined top tags of the
        community and its label is the most common class among the
        community's non-test documents. The test samples are the unique
        words of each test document. ``x`` and ``y`` are not used.

        Requires the keyword arguments 'df' (DataFrame with 'identifier',
        'text' and 'class_number' columns), 'positions_train',
        'positions_test', 'database', 'results_table' and 'classifiers'.

        Raises:
            IndexError: if a community consists of test documents only.
        """
        df = kwargs['df']
        positions_train = kwargs['positions_train']
        positions_test = kwargs['positions_test']
        train_docs = list(df.iloc[positions_train]['identifier'])
        test_docs = list(df.iloc[positions_test]['identifier'])
        # Set lookup keeps the per-community filtering below linear.
        test_docs_lookup = set(test_docs)
        database = kwargs['database']
        unique_community_ids = list(set([self.__doc_to_community_dict[doc]
                                         for doc in train_docs]))

        communities_y = []
        communities_tags = []
        for community_id in unique_community_ids:
            # Find the most common community class.
            community_docs = self.__doc_communities_dict[community_id]
            classes = [config_experiments.extract_file_class(doc)
                       for doc in community_docs if doc not in test_docs_lookup]
            communities_y.append(Counter(classes).most_common(1)[0][0])
            # Get the most important tags of each community.
            communities_tags.append(' '.join(
                select.get_community_tags(
                    database, community_id, top_terms = self.__top_n)))

        cv = CountVectorizer()
        x_transformed = cv.fit_transform(communities_tags)
        communities_y_encoded = self.__label_encoder.transform(communities_y)

        test_rows = df[df['identifier'].isin(test_docs)]
        # Each test document is reduced to its set of distinct words.
        x_test_docs = [' '.join(list(set(doc.split())))
                       for doc in list(test_rows['text'])]
        x_test_docs_transformed = cv.transform(x_test_docs)
        y_test = list(test_rows['class_number'])

        results_table = kwargs['results_table']
        classifiers = kwargs['classifiers']
        extra_details = {
            'top_n': self.__top_n
        }
        return self._collect_evaluation_results(
            x_transformed, communities_y_encoded, x_test_docs_transformed, y_test,
            results_table, classifiers, 'DOC2COM+', extra_details = extra_details)


class GraphOfDocsBigramsExtractionEvaluator(Evaluator):
    """Binary bigram features taken from the graph database."""

    def __init__(self, top_n = None, min_weight = None,
                 test_size = 0.33, random_state = 42):
        """
        Args:
            top_n: keep only the first ``top_n`` bigrams of each document.
                Takes precedence over ``min_weight`` when not None.
            min_weight: keep only bigrams whose element at index 3 is at
                least this value. Used only when ``top_n`` is None.
        """
        super().__init__(test_size, random_state)
        self.__top_n = top_n
        self.__min_weight = min_weight

    def __generate_bigram_features(self, document_bigrams):
        """Filter a document's bigrams and render each as 'first-second'."""
        if self.__top_n is not None:
            document_bigrams = document_bigrams[:self.__top_n]
        elif self.__min_weight is not None:
            # NOTE(review): index 3 is presumably the bigram weight -
            # confirm against select.get_word_digrams_by_filename.
            document_bigrams = [bigram for bigram in document_bigrams
                                if bigram[3] >= self.__min_weight]
        return [bigram[0] + '-' + bigram[1] for bigram in document_bigrams]

    def __convert_documents_to_bigrams_dicts(self, database, document_ids):
        """Return one ``{bigram: 1}`` dict per document id, in order."""
        bigrams_dicts = []
        for document_id in document_ids:
            bigrams = select.get_word_digrams_by_filename(database, document_id)[0][0]
            bigrams = self.__generate_bigram_features(bigrams)
            bigrams_dicts.append({bigram: 1 for bigram in bigrams})
        return bigrams_dicts

    def evaluate(self, x, y, **kwargs):
        """Evaluate on bigram presence features read from the database.

        ``x`` is not used; the documents are split by the 'identifier'
        column of ``kwargs['df']`` together with ``y``.

        Requires the keyword arguments 'df', 'database', 'results_table'
        and 'classifiers'.
        """
        df = kwargs['df']
        database = kwargs['database']
        train_docs, test_docs, y_train, y_test = self._split_train_test(
            df['identifier'], y)
        train_docs = list(train_docs)
        test_docs = list(test_docs)

        train_documents = self.__convert_documents_to_bigrams_dicts(database, train_docs)
        test_documents = self.__convert_documents_to_bigrams_dicts(database, test_docs)

        train_transformed, test_transformed = self._fit_transform_features(
            DictVectorizer(), train_documents, test_documents)

        results_table = kwargs['results_table']
        classifiers = kwargs['classifiers']
        extra_details = {
            'top_n': self.__top_n,
            'min_weight': self.__min_weight
        }
        # Same test as __generate_bigram_features, so top_n = 0 is labelled
        # by the filter that was actually applied.
        prefix = 'TOP_N' if self.__top_n is not None else 'MIN_WEIGHT'
        return self._collect_evaluation_results(
            train_transformed, y_train, test_transformed,
            y_test, results_table, classifiers, f'GOD+BI {prefix}+',
            extra_details = extra_details)
3 | DATASET DIR PATH: C:\Users\USER\source\repos\GraphOfDocs\GraphOfDocs\datasets\lingspam 4 | MIN NUMBER OF DOCUMENTS PER SELECTED COMMUNITY: 2 5 | VARIANCE THRESHOLD: [0.0005, 0.001, 0.0015, 0.002, 0.003, 0.004, 0.005, 0.01] 6 | SELECT KBEST K: [1000, 2000, 3000, 4000, 5000, 6000, 7000, 10000, 14000] 7 | TOP N SELECTED COMMUNITY TERMS: [5, 10, 15, 20, 25, 50, 100, 250, 500] 8 | Number of selected documents: 816 9 | EXAMPLE OF THE PANDAS DATAFRAME 10 | identifier class class_number text 11 | 1530 msg_9-1191msg1.txt msg 0 workshop embodied conversational characters ca... 12 | 1769 msg_9-159msg2.txt msg 0 evaluation parsing systems evaluation parsing ... 13 | Number of unique classes: 2 14 | Number of features in BOWEvaluator:16695 15 | C:\Users\USER\source\repos\GraphOfDocs\GraphOfDocs\Virtual Environment\lib\site-packages\sklearn\svm\_base.py:947: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations. 16 | "the number of iterations.", ConvergenceWarning) 17 | EXAMPLE OF THE EVALUATION RESULTS PANDAS DATAFRAME 18 | Method Accuracy Number of features Train size Test size Classifier variance thershold kbest top_n 19 | 0 BOW+NB 0.9963 16695 546 270 NB NaN NaN NaN 20 | 1 BOW+LR 1.0000 16695 546 270 LR NaN NaN NaN 21 | +--------------------------+----------+--------------------+------------+-----------+--------------------------------+ 22 | | Method | Accuracy | Number of features | Train size | Test size | Details | 23 | +--------------------------+----------+--------------------+------------+-----------+--------------------------------+ 24 | | TOPN+NN100x50 | 1.0 | 8975 | 546 | 270 | {'top_n': 250} | 25 | | TOPN+NN100x50 | 1.0 | 2274 | 546 | 270 | {'top_n': 50} | 26 | | TOPN+NN100x50 | 1.0 | 995 | 546 | 270 | {'top_n': 25} | 27 | | TOPN+NN100x50 | 1.0 | 758 | 546 | 270 | {'top_n': 20} | 28 | | TOPN+LR | 1.0 | 12104 | 546 | 270 | {'top_n': 500} | 29 | | TOPN+LR | 1.0 | 8975 | 546 | 270 | {'top_n': 250} | 30 | | TOPN+LR | 1.0 | 4443 | 546 | 
270 | {'top_n': 100} | 31 | | TOPN+LR | 1.0 | 2274 | 546 | 270 | {'top_n': 50} | 32 | | TOPN+LR | 1.0 | 995 | 546 | 270 | {'top_n': 25} | 33 | | TOPN+LR | 1.0 | 758 | 546 | 270 | {'top_n': 20} | 34 | | TOPN+LR | 1.0 | 526 | 546 | 270 | {'top_n': 15} | 35 | | TOPN+LR | 1.0 | 296 | 546 | 270 | {'top_n': 10} | 36 | | TOPN+LR | 1.0 | 120 | 546 | 270 | {'top_n': 5} | 37 | | META+NN100x50 | 1.0 | 2509 | 546 | 270 | {} | 38 | | META+LSVM | 1.0 | 2509 | 546 | 270 | {} | 39 | | META+LR | 1.0 | 2509 | 546 | 270 | {} | 40 | | LVAR+NN100x50 | 1.0 | 11058 | 546 | 270 | {'variance thershold': 0.003} | 41 | | LVAR+NN100x50 | 1.0 | 11058 | 546 | 270 | {'variance thershold': 0.002} | 42 | | LVAR+LSVM | 1.0 | 16695 | 546 | 270 | {'variance thershold': 0.001} | 43 | | LVAR+LSVM | 1.0 | 16695 | 546 | 270 | {'variance thershold': 0.0015} | 44 | | LVAR+LSVM | 1.0 | 16695 | 546 | 270 | {'variance thershold': 0.0005} | 45 | | LVAR+LR | 1.0 | 16695 | 546 | 270 | {'variance thershold': 0.001} | 46 | | LVAR+LR | 1.0 | 16695 | 546 | 270 | {'variance thershold': 0.0015} | 47 | | LVAR+LR | 1.0 | 16695 | 546 | 270 | {'variance thershold': 0.0005} | 48 | | LVAR+LR | 1.0 | 11058 | 546 | 270 | {'variance thershold': 0.003} | 49 | | LVAR+LR | 1.0 | 11058 | 546 | 270 | {'variance thershold': 0.002} | 50 | | LVAR+LR | 1.0 | 8234 | 546 | 270 | {'variance thershold': 0.005} | 51 | | LVAR+LR | 1.0 | 8234 | 546 | 270 | {'variance thershold': 0.004} | 52 | | KBEST+NN100x50 | 1.0 | 1000 | 546 | 270 | {'kbest': 1000} | 53 | | KBEST+LSVM | 1.0 | 1000 | 546 | 270 | {'kbest': 1000} | 54 | | KBEST+LR | 1.0 | 14000 | 546 | 270 | {'kbest': 14000} | 55 | | KBEST+LR | 1.0 | 10000 | 546 | 270 | {'kbest': 10000} | 56 | | KBEST+LR | 1.0 | 1000 | 546 | 270 | {'kbest': 1000} | 57 | | Graph-of-docs Classifier | 1.0 | N/A | N/A | 270 | | 58 | | BOW+LSVM | 1.0 | 16695 | 546 | 270 | {} | 59 | | BOW+LR | 1.0 | 16695 | 546 | 270 | {} | 60 | | TOPN+NN100x50 | 0.9963 | 12104 | 546 | 270 | {'top_n': 500} | 61 | | TOPN+NN100x50 | 
0.9963 | 4443 | 546 | 270 | {'top_n': 100} | 62 | | TOPN+NN100x50 | 0.9963 | 120 | 546 | 270 | {'top_n': 5} | 63 | | TOPN+NB | 0.9963 | 12104 | 546 | 270 | {'top_n': 500} | 64 | | TOPN+NB | 0.9963 | 8975 | 546 | 270 | {'top_n': 250} | 65 | | TOPN+NB | 0.9963 | 4443 | 546 | 270 | {'top_n': 100} | 66 | | TOPN+NB | 0.9963 | 2274 | 546 | 270 | {'top_n': 50} | 67 | | TOPN+LSVM | 0.9963 | 12104 | 546 | 270 | {'top_n': 500} | 68 | | TOPN+LSVM | 0.9963 | 8975 | 546 | 270 | {'top_n': 250} | 69 | | TOPN+LSVM | 0.9963 | 4443 | 546 | 270 | {'top_n': 100} | 70 | | TOPN+LSVM | 0.9963 | 2274 | 546 | 270 | {'top_n': 50} | 71 | | TOPN+LSVM | 0.9963 | 995 | 546 | 270 | {'top_n': 25} | 72 | | TOPN+LSVM | 0.9963 | 758 | 546 | 270 | {'top_n': 20} | 73 | | TOPN+LSVM | 0.9963 | 526 | 546 | 270 | {'top_n': 15} | 74 | | TOPN+LSVM | 0.9963 | 296 | 546 | 270 | {'top_n': 10} | 75 | | TOPN+LSVM | 0.9963 | 120 | 546 | 270 | {'top_n': 5} | 76 | | META+NB | 0.9963 | 2509 | 546 | 270 | {} | 77 | | LVAR+NN100x50 | 0.9963 | 16695 | 546 | 270 | {'variance thershold': 0.001} | 78 | | LVAR+NN100x50 | 0.9963 | 16695 | 546 | 270 | {'variance thershold': 0.0015} | 79 | | LVAR+NN100x50 | 0.9963 | 16695 | 546 | 270 | {'variance thershold': 0.0005} | 80 | | LVAR+NN100x50 | 0.9963 | 8234 | 546 | 270 | {'variance thershold': 0.005} | 81 | | LVAR+NN100x50 | 0.9963 | 8234 | 546 | 270 | {'variance thershold': 0.004} | 82 | | LVAR+NN100x50 | 0.9963 | 5464 | 546 | 270 | {'variance thershold': 0.01} | 83 | | LVAR+NB | 0.9963 | 16695 | 546 | 270 | {'variance thershold': 0.001} | 84 | | LVAR+NB | 0.9963 | 16695 | 546 | 270 | {'variance thershold': 0.0015} | 85 | | LVAR+NB | 0.9963 | 16695 | 546 | 270 | {'variance thershold': 0.0005} | 86 | | LVAR+NB | 0.9963 | 11058 | 546 | 270 | {'variance thershold': 0.003} | 87 | | LVAR+NB | 0.9963 | 11058 | 546 | 270 | {'variance thershold': 0.002} | 88 | | LVAR+NB | 0.9963 | 8234 | 546 | 270 | {'variance thershold': 0.005} | 89 | | LVAR+NB | 0.9963 | 8234 | 546 | 270 | {'variance 
thershold': 0.004} | 90 | | LVAR+LSVM | 0.9963 | 11058 | 546 | 270 | {'variance thershold': 0.003} | 91 | | LVAR+LSVM | 0.9963 | 11058 | 546 | 270 | {'variance thershold': 0.002} | 92 | | LVAR+LSVM | 0.9963 | 8234 | 546 | 270 | {'variance thershold': 0.005} | 93 | | LVAR+LSVM | 0.9963 | 8234 | 546 | 270 | {'variance thershold': 0.004} | 94 | | LVAR+LSVM | 0.9963 | 5464 | 546 | 270 | {'variance thershold': 0.01} | 95 | | LVAR+LR | 0.9963 | 5464 | 546 | 270 | {'variance thershold': 0.01} | 96 | | KBEST+NN100x50 | 0.9963 | 14000 | 546 | 270 | {'kbest': 14000} | 97 | | KBEST+NN100x50 | 0.9963 | 10000 | 546 | 270 | {'kbest': 10000} | 98 | | KBEST+NN100x50 | 0.9963 | 2000 | 546 | 270 | {'kbest': 2000} | 99 | | KBEST+NB | 0.9963 | 14000 | 546 | 270 | {'kbest': 14000} | 100 | | KBEST+NB | 0.9963 | 10000 | 546 | 270 | {'kbest': 10000} | 101 | | KBEST+NB | 0.9963 | 7000 | 546 | 270 | {'kbest': 7000} | 102 | | KBEST+NB | 0.9963 | 6000 | 546 | 270 | {'kbest': 6000} | 103 | | KBEST+NB | 0.9963 | 5000 | 546 | 270 | {'kbest': 5000} | 104 | | KBEST+NB | 0.9963 | 4000 | 546 | 270 | {'kbest': 4000} | 105 | | KBEST+NB | 0.9963 | 3000 | 546 | 270 | {'kbest': 3000} | 106 | | KBEST+LSVM | 0.9963 | 14000 | 546 | 270 | {'kbest': 14000} | 107 | | KBEST+LSVM | 0.9963 | 10000 | 546 | 270 | {'kbest': 10000} | 108 | | KBEST+LSVM | 0.9963 | 7000 | 546 | 270 | {'kbest': 7000} | 109 | | KBEST+LSVM | 0.9963 | 6000 | 546 | 270 | {'kbest': 6000} | 110 | | KBEST+LSVM | 0.9963 | 5000 | 546 | 270 | {'kbest': 5000} | 111 | | KBEST+LSVM | 0.9963 | 4000 | 546 | 270 | {'kbest': 4000} | 112 | | KBEST+LSVM | 0.9963 | 3000 | 546 | 270 | {'kbest': 3000} | 113 | | KBEST+LSVM | 0.9963 | 2000 | 546 | 270 | {'kbest': 2000} | 114 | | KBEST+LR | 0.9963 | 7000 | 546 | 270 | {'kbest': 7000} | 115 | | KBEST+LR | 0.9963 | 6000 | 546 | 270 | {'kbest': 6000} | 116 | | KBEST+LR | 0.9963 | 5000 | 546 | 270 | {'kbest': 5000} | 117 | | KBEST+LR | 0.9963 | 4000 | 546 | 270 | {'kbest': 4000} | 118 | | KBEST+LR | 0.9963 | 3000 | 
546 | 270 | {'kbest': 3000} | 119 | | KBEST+LR | 0.9963 | 2000 | 546 | 270 | {'kbest': 2000} | 120 | | BOW+NN100x50 | 0.9963 | 16695 | 546 | 270 | {} | 121 | | BOW+NB | 0.9963 | 16695 | 546 | 270 | {} | 122 | | TOPN+NN100x50 | 0.9926 | 526 | 546 | 270 | {'top_n': 15} | 123 | | TOPN+NN100x50 | 0.9926 | 296 | 546 | 270 | {'top_n': 10} | 124 | | TOPN+5NN | 0.9926 | 120 | 546 | 270 | {'top_n': 5} | 125 | | KBEST+NN100x50 | 0.9926 | 7000 | 546 | 270 | {'kbest': 7000} | 126 | | KBEST+NN100x50 | 0.9926 | 6000 | 546 | 270 | {'kbest': 6000} | 127 | | KBEST+NN100x50 | 0.9926 | 5000 | 546 | 270 | {'kbest': 5000} | 128 | | KBEST+NN100x50 | 0.9926 | 4000 | 546 | 270 | {'kbest': 4000} | 129 | | KBEST+NN100x50 | 0.9926 | 3000 | 546 | 270 | {'kbest': 3000} | 130 | | KBEST+1KNN | 0.9889 | 3000 | 546 | 270 | {'kbest': 3000} | 131 | | TOPN+2NN | 0.9852 | 120 | 546 | 270 | {'top_n': 5} | 132 | | TOPN+1KNN | 0.9852 | 120 | 546 | 270 | {'top_n': 5} | 133 | | KBEST+2NN | 0.9852 | 4000 | 546 | 270 | {'kbest': 4000} | 134 | | KBEST+1KNN | 0.9852 | 4000 | 546 | 270 | {'kbest': 4000} | 135 | | TOPN+NB | 0.9815 | 526 | 546 | 270 | {'top_n': 15} | 136 | | TOPN+NB | 0.9815 | 120 | 546 | 270 | {'top_n': 5} | 137 | | LVAR+NB | 0.9815 | 5464 | 546 | 270 | {'variance thershold': 0.01} | 138 | | KBEST+2NN | 0.9815 | 3000 | 546 | 270 | {'kbest': 3000} | 139 | | KBEST+2NN | 0.9815 | 1000 | 546 | 270 | {'kbest': 1000} | 140 | | KBEST+1KNN | 0.9815 | 1000 | 546 | 270 | {'kbest': 1000} | 141 | | TOPN+NB | 0.9778 | 995 | 546 | 270 | {'top_n': 25} | 142 | | TOPN+NB | 0.9778 | 758 | 546 | 270 | {'top_n': 20} | 143 | | TOPN+NB | 0.9778 | 296 | 546 | 270 | {'top_n': 10} | 144 | | KBEST+NB | 0.9778 | 2000 | 546 | 270 | {'kbest': 2000} | 145 | | KBEST+NB | 0.9778 | 1000 | 546 | 270 | {'kbest': 1000} | 146 | | KBEST+5NN | 0.9778 | 5000 | 546 | 270 | {'kbest': 5000} | 147 | | KBEST+5NN | 0.9741 | 2000 | 546 | 270 | {'kbest': 2000} | 148 | | KBEST+2NN | 0.9741 | 5000 | 546 | 270 | {'kbest': 5000} | 149 | | 
KBEST+2NN | 0.9741 | 2000 | 546 | 270 | {'kbest': 2000} | 150 | | KBEST+1KNN | 0.9741 | 5000 | 546 | 270 | {'kbest': 5000} | 151 | | KBEST+1KNN | 0.9741 | 2000 | 546 | 270 | {'kbest': 2000} | 152 | | TOPN+5NN | 0.9704 | 296 | 546 | 270 | {'top_n': 10} | 153 | | TOPN+2NN | 0.9667 | 526 | 546 | 270 | {'top_n': 15} | 154 | | TOPN+1KNN | 0.9667 | 526 | 546 | 270 | {'top_n': 15} | 155 | | KBEST+5NN | 0.9667 | 6000 | 546 | 270 | {'kbest': 6000} | 156 | | KBEST+5NN | 0.9667 | 3000 | 546 | 270 | {'kbest': 3000} | 157 | | KBEST+2NN | 0.9667 | 6000 | 546 | 270 | {'kbest': 6000} | 158 | | KBEST+1KNN | 0.9667 | 6000 | 546 | 270 | {'kbest': 6000} | 159 | | KBEST+5NN | 0.963 | 4000 | 546 | 270 | {'kbest': 4000} | 160 | | TOPN+1KNN | 0.9593 | 296 | 546 | 270 | {'top_n': 10} | 161 | | KBEST+5NN | 0.9593 | 1000 | 546 | 270 | {'kbest': 1000} | 162 | | TOPN+2NN | 0.9556 | 296 | 546 | 270 | {'top_n': 10} | 163 | | KBEST+2NN | 0.9556 | 7000 | 546 | 270 | {'kbest': 7000} | 164 | | KBEST+1KNN | 0.9556 | 7000 | 546 | 270 | {'kbest': 7000} | 165 | | TOPN+2NN | 0.9519 | 995 | 546 | 270 | {'top_n': 25} | 166 | | TOPN+2NN | 0.9519 | 758 | 546 | 270 | {'top_n': 20} | 167 | | TOPN+1KNN | 0.9519 | 995 | 546 | 270 | {'top_n': 25} | 168 | | TOPN+1KNN | 0.9519 | 758 | 546 | 270 | {'top_n': 20} | 169 | | TOPN+5NN | 0.9481 | 526 | 546 | 270 | {'top_n': 15} | 170 | | KBEST+5NN | 0.9481 | 7000 | 546 | 270 | {'kbest': 7000} | 171 | | TOPN+5NN | 0.9407 | 758 | 546 | 270 | {'top_n': 20} | 172 | | TOPN+2NN | 0.9333 | 4443 | 546 | 270 | {'top_n': 100} | 173 | | TOPN+2NN | 0.9333 | 2274 | 546 | 270 | {'top_n': 50} | 174 | | TOPN+1KNN | 0.9333 | 4443 | 546 | 270 | {'top_n': 100} | 175 | | TOPN+1KNN | 0.9333 | 2274 | 546 | 270 | {'top_n': 50} | 176 | | TOPN+5NN | 0.9259 | 995 | 546 | 270 | {'top_n': 25} | 177 | | TOPN+2NN | 0.9259 | 12104 | 546 | 270 | {'top_n': 500} | 178 | | TOPN+2NN | 0.9259 | 8975 | 546 | 270 | {'top_n': 250} | 179 | | TOPN+1KNN | 0.9259 | 12104 | 546 | 270 | {'top_n': 500} | 180 | | 
TOPN+1KNN | 0.9259 | 8975 | 546 | 270 | {'top_n': 250} | 181 | | META+2NN | 0.9222 | 2509 | 546 | 270 | {} | 182 | | META+1KNN | 0.9222 | 2509 | 546 | 270 | {} | 183 | | LVAR+2NN | 0.9222 | 5464 | 546 | 270 | {'variance thershold': 0.01} | 184 | | LVAR+1KNN | 0.9222 | 5464 | 546 | 270 | {'variance thershold': 0.01} | 185 | | KBEST+2NN | 0.9222 | 14000 | 546 | 270 | {'kbest': 14000} | 186 | | KBEST+2NN | 0.9222 | 10000 | 546 | 270 | {'kbest': 10000} | 187 | | KBEST+1KNN | 0.9222 | 14000 | 546 | 270 | {'kbest': 14000} | 188 | | KBEST+1KNN | 0.9222 | 10000 | 546 | 270 | {'kbest': 10000} | 189 | | LVAR+2NN | 0.9185 | 11058 | 546 | 270 | {'variance thershold': 0.003} | 190 | | LVAR+2NN | 0.9185 | 11058 | 546 | 270 | {'variance thershold': 0.002} | 191 | | LVAR+2NN | 0.9185 | 8234 | 546 | 270 | {'variance thershold': 0.005} | 192 | | LVAR+2NN | 0.9185 | 8234 | 546 | 270 | {'variance thershold': 0.004} | 193 | | LVAR+1KNN | 0.9185 | 11058 | 546 | 270 | {'variance thershold': 0.003} | 194 | | LVAR+1KNN | 0.9185 | 11058 | 546 | 270 | {'variance thershold': 0.002} | 195 | | LVAR+1KNN | 0.9185 | 8234 | 546 | 270 | {'variance thershold': 0.005} | 196 | | LVAR+1KNN | 0.9185 | 8234 | 546 | 270 | {'variance thershold': 0.004} | 197 | | LVAR+2NN | 0.9074 | 16695 | 546 | 270 | {'variance thershold': 0.001} | 198 | | LVAR+2NN | 0.9074 | 16695 | 546 | 270 | {'variance thershold': 0.0015} | 199 | | LVAR+2NN | 0.9074 | 16695 | 546 | 270 | {'variance thershold': 0.0005} | 200 | | LVAR+1KNN | 0.9074 | 16695 | 546 | 270 | {'variance thershold': 0.001} | 201 | | LVAR+1KNN | 0.9074 | 16695 | 546 | 270 | {'variance thershold': 0.0015} | 202 | | LVAR+1KNN | 0.9074 | 16695 | 546 | 270 | {'variance thershold': 0.0005} | 203 | | BOW+2NN | 0.9074 | 16695 | 546 | 270 | {} | 204 | | BOW+1KNN | 0.9074 | 16695 | 546 | 270 | {} | 205 | | TOPN+5NN | 0.8926 | 2274 | 546 | 270 | {'top_n': 50} | 206 | | LVAR+5NN | 0.8926 | 5464 | 546 | 270 | {'variance thershold': 0.01} | 207 | | KBEST+5NN | 0.8889 | 
10000 | 546 | 270 | {'kbest': 10000} | 208 | | TOPN+5NN | 0.8778 | 4443 | 546 | 270 | {'top_n': 100} | 209 | | META+5NN | 0.8704 | 2509 | 546 | 270 | {} | 210 | | TOPN+5NN | 0.8481 | 8975 | 546 | 270 | {'top_n': 250} | 211 | | TOPN+5NN | 0.8407 | 12104 | 546 | 270 | {'top_n': 500} | 212 | | LVAR+5NN | 0.837 | 8234 | 546 | 270 | {'variance thershold': 0.005} | 213 | | LVAR+5NN | 0.837 | 8234 | 546 | 270 | {'variance thershold': 0.004} | 214 | | KBEST+5NN | 0.837 | 14000 | 546 | 270 | {'kbest': 14000} | 215 | | LVAR+5NN | 0.8333 | 16695 | 546 | 270 | {'variance thershold': 0.001} | 216 | | LVAR+5NN | 0.8333 | 16695 | 546 | 270 | {'variance thershold': 0.0015} | 217 | | LVAR+5NN | 0.8333 | 16695 | 546 | 270 | {'variance thershold': 0.0005} | 218 | | BOW+5NN | 0.8333 | 16695 | 546 | 270 | {} | 219 | | LVAR+5NN | 0.8296 | 11058 | 546 | 270 | {'variance thershold': 0.003} | 220 | | LVAR+5NN | 0.8296 | 11058 | 546 | 270 | {'variance thershold': 0.002} | 221 | +--------------------------+----------+--------------------+------------+-----------+--------------------------------+ 222 | Execution time: 263.13525189999996 223 | !END OF THE EXPERIMENT! 224 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% --------------------------------------------------------------------------------