├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── UGFraud ├── Demo │ ├── __init__.py │ ├── demo_pre.py │ ├── eval_Fraudar.py │ ├── eval_GANG.py │ ├── eval_Prior.py │ ├── eval_SVD.py │ ├── eval_SpEagle.py │ ├── eval_ZooBP.py │ ├── eval_fBox.py │ └── testing.py ├── Detector │ ├── Fraudar.py │ ├── GANG.py │ ├── MinTree.py │ ├── SVD.py │ ├── SpEagle.py │ ├── ZooBP.py │ ├── __init__.py │ └── fBox.py ├── Utils │ ├── __init__.py │ └── helper.py ├── Yelp_Data │ └── YelpChi │ │ ├── metadata.gz │ │ └── priors.pkl └── __init__.py ├── UGFraud_logo.png ├── reference ├── fbox.txt ├── fraudar.txt ├── gang.txt ├── speagle.txt ├── svd.txt └── zoobp.txt ├── requirements.txt ├── setup.py └── tests └── testing.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | **/*.ipynb_checkpoints/ 3 | **/*.ipynb/ 4 | **/__pycache__ 5 | /Detector/__pycache__ 6 | /Utils/__pycache__ 7 | __pycache__ 8 | __pycache__/ 9 | .idea 10 | .idea/ 11 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | - "3.7" 5 | # command to install dependencies 6 | install: 7 | - pip install -r requirements.txt 8 | script: 9 | - python tests/testing.py -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | [UGFraud logo (UGFraud_logo.png)]
9 | [Badges: Building | GitHub | Downloads | Pypi version]
25 | An Unsupervised Graph-based Toolbox for Fraud Detection
26 | 27 | **Introduction:** 28 | UGFraud is an unsupervised graph-based fraud detection toolbox that integrates several state-of-the-art graph-based fraud detection algorithms. It can be applied to bipartite graphs (e.g., user-product graph), and it can estimate the suspiciousness of both nodes and edges. The implemented models can be found [here](#implemented-models). 29 | 30 | The toolbox incorporates the Markov Random Field (MRF)-based algorithm, dense-block detection-based algorithm, and SVD-based algorithm. For MRF-based algorithms, the users only need the graph structure and the prior suspicious score of the nodes as the input. For other algorithms, the graph structure is the only input. 31 | 32 | Meanwhile, we have a [deep graph-based fraud detection toolbox](https://github.com/safe-graph/DGFraud) which implements state-of-the-art graph neural network-based fraud detectors. 33 | 34 | We welcome contributions on adding new fraud detectors and extending the features of the toolbox. Some of the planned features are listed in [TODO list](#todo-list). 35 | 36 | If you use the toolbox in your project, please cite the [paper](https://arxiv.org/abs/2006.06069) below and the [algorithms](#implemented-models) you used : 37 | ```bibtex 38 | @inproceedings{dou2020robust, 39 | title={Robust Spammer Detection by Nash Reinforcement Learning}, 40 | author={Dou, Yingtong and Ma, Guixiang and Yu, Philip S and Xie, Sihong}, 41 | booktitle={Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery \& Data Mining}, 42 | year={2020} 43 | } 44 | ``` 45 | 46 | **Useful Resources** 47 | - [PyGOD: A Python Library for Graph Outlier Detection (Anomaly Detection)](https://github.com/pygod-team/pygod) 48 | - [DGFraud: A Deep Graph-based Fraud Detection Toolbox](https://github.com/safe-graph/DGFraud) 49 | - [Graph-based Fraud Detection Paper List](https://github.com/safe-graph/graph-fraud-detection-papers) 50 | - [Awesome Fraud Detection Papers](https://github.com/benedekrozemberczki/awesome-fraud-detection-papers) 51 | - [Attack and Defense Papers on Graph Data](https://github.com/safe-graph/graph-adversarial-learning-literature) 52 | - [PyOD: A Python Toolbox for Scalable Outlier Detection (Anomaly Detection)](https://github.com/yzhao062/pyod) 53 | - [PyODD: An End-to-end Outlier Detection System](https://github.com/datamllab/pyodds) 54 | - [DGL: Deep Graph Library](https://github.com/dmlc/dgl) 55 | - [Outlier Detection DataSets (ODDS)](http://odds.cs.stonybrook.edu/) 56 | 57 | **Table of Contents** 58 | - [Installation](#installation) 59 | - [User Guide](#user-guide) 60 | - [Implemented Models](#implemented-models) 61 | - [Model Comparison](#model-comparison) 62 | - [TODO List](#todo-list) 63 | - [How to Contribute](#how-to-contribute) 64 | 65 | 66 | ## Installation 67 | You can install UGFraud from `pypi`: 68 | 69 | ```bash 70 | pip install UGFraud 71 | ``` 72 | 73 | or download and install from `github`: 74 | 75 | ```bash 76 | git clone https://github.com/safe-graph/UGFraud.git 77 | cd UGFraud 78 | python setup.py install 79 | ``` 80 | 81 | ### Dataset 82 | The demo data is not the intact data (`rating` and `date` information are missing). The rating information is only used in ZooBP demo. If you need the intact date to play demo, please email [bdscsafegraph@gmail.com](mailto:bdscsafegraph@gmail.com) to download the intact data from [Yelp Spam Review Dataset](http://odds.cs.stonybrook.edu/yelpchi-dataset/). 
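Both the bundled demo data and the intact data are turned into the graph that the demo scripts consume by `/UGFraud/Demo/demo_pre.py`. A minimal sketch (run from the `UGFraud/Demo/` folder, since the demo uses paths relative to it):

```python
# Build the demo graph once; this mirrors UGFraud/Demo/demo_pre.py and assumes
# the YelpChi demo files are located under ../Yelp_Data/YelpChi/.
from UGFraud.Demo.demo_pre import data_to_network_graph
from UGFraud.Utils.helper import load_graph

data_to_network_graph('../Yelp_Data/')  # reads metadata.gz and priors.pkl, writes Yelp_graph_data.json
G = load_graph('Yelp_graph_data.json')  # networkx graph with node/edge priors and labels attached
```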
The `metadata.gz` file in `/UGFraud/Yelp_Data/YelpChi` includes:
83 | - `user_id`: 38063 users
84 | - `product_id`: 201 products
85 | - `rating`: from 1.0 (low) to 5.0 (high)
86 | - `label`: -1 is not spam, 1 is spam
87 | - `date`: review creation date
88 |
89 |
90 | ## User Guide
91 |
92 | ### Running the example code
93 | You can find the example scripts for the implemented models in the `/UGFraud/Demo/` directory. For example, you can run fBox using:
94 | ```bash
95 | python eval_fBox.py
96 | ```
97 |
98 | ### Running on your datasets
99 | Check out the `data_to_network_graph` function in `/UGFraud/Demo/demo_pre.py` to convert your data into a [networkx](https://networkx.github.io/documentation/stable/tutorial.html#creating-a-graph) graph.
100 |
101 | To use your own data, you need to provide at least the following information:
102 | * a dict of dicts:
103 | ```
104 | 'user_id': {
105 | 'product_id': {
106 | 'label': 1
107 | }
108 | }
109 | ```
110 | * a dict of priors
111 |
112 | You can use the `dict_to_networkx(graph_dict)` function from `/Utils/helper.py` to convert your `graph_dict` into a networkx graph.
113 | For more details, please see the `data_to_network_graph` function in `/UGFraud/Demo/demo_pre.py`.
114 |
115 | ### The structure of code
116 | The `/UGFraud` repository is organized as follows:
117 | - `Demo/` contains the example code for running the implemented models;
118 | - `Detector/` contains the model implementations;
119 | - `Yelp_Data/` contains the necessary dataset files;
120 | - `Utils/` contains all the helper functions.
121 |
122 |
123 | ## Implemented Models
124 |
125 | | Model | Paper | Venue | Reference |
126 | |-------|--------|--------|--------|
127 | | **SpEagle** | [Collective Opinion Spam Detection: Bridging Review Networks and Metadata](https://www.andrew.cmu.edu/user/lakoglu/pubs/15-kdd-collectiveopinionspam.pdf) | KDD 2015 | [BibTex](https://github.com/safe-graph/UGFraud/blob/master/reference/speagle.txt) |
128 | | **GANG** | [GANG: Detecting Fraudulent Users in Online Social Networks via Guilt-by-Association on Directed Graph](https://ieeexplore.ieee.org/document/8215519) | ICDM 2017 | [BibTex](https://github.com/safe-graph/UGFraud/blob/master/reference/gang.txt) |
129 | | **fBox** | [Spotting Suspicious Link Behavior with fBox: An Adversarial Perspective](https://arxiv.org/pdf/1410.3915.pdf) | ICDM 2014 | [BibTex](https://github.com/safe-graph/UGFraud/blob/master/reference/fbox.txt) |
130 | | **Fraudar** | [FRAUDAR: Bounding Graph Fraud in the Face of Camouflage](https://bhooi.github.io/papers/fraudar_kdd16.pdf) | KDD 2016 | [BibTex](https://github.com/safe-graph/UGFraud/blob/master/reference/fraudar.txt) |
131 | | **ZooBP** | [ZooBP: Belief Propagation for Heterogeneous Networks](http://www.vldb.org/pvldb/vol10/p625-eswaran.pdf) | VLDB 2017 | [BibTex](https://github.com/safe-graph/UGFraud/blob/master/reference/zoobp.txt) |
132 | | **SVD** | [Singular value decomposition and least squares solutions](https://link.springer.com/content/pdf/10.1007/978-3-662-39778-7_10.pdf) | - | [BibTex](https://github.com/safe-graph/UGFraud/blob/master/reference/svd.txt) |
133 | | **Prior** | Evaluating suspiciousness based on prior information | - | - |
134 |
135 |
136 | ## Model Comparison
137 | | Model | Application | Graph Type | Model Type |
138 | |-------|--------|--------|-------|
139 | | **SpEagle** | Review Spam | Tripartite | MRF |
140 | | **GANG** | Social Sybil | Bipartite | MRF |
141 | | **fBox** | Social Fraudster | Bipartite | SVD |
142 | | **Fraudar** | Social Fraudster | Bipartite | Dense-block |
143
| | **ZooBP** | E-commerce Fraud | Tripartite | MRF | 144 | | **SVD** | Dimension Reduction | Bipartite | SVD | 145 | 146 | 147 | ## TODO List 148 | - Homogeneous graph implementation 149 | 150 | 151 | ## How to Contribute 152 | You are welcomed to contribute to this open-source toolbox. Currently, you can create issues or send email to [bdscsafegraph@gmail.com](mailto:bdscsafegraph@gmail.com) for inquiry. 153 | -------------------------------------------------------------------------------- /UGFraud/Demo/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- -------------------------------------------------------------------------------- /UGFraud/Demo/demo_pre.py: -------------------------------------------------------------------------------- 1 | from UGFraud.Utils.helper import * 2 | import networkx as nx 3 | import sys 4 | import os 5 | import random 6 | import pickle as pkl 7 | sys.path.insert(0, os.path.abspath('../../')) 8 | 9 | 10 | def data_to_network_graph(data_path): 11 | # data source 12 | data_name = 'YelpChi' 13 | # prefix = '../Yelp_Data/' + data_name + '/' 14 | prefix = data_path + data_name + '/' 15 | metadata_filename = prefix + 'metadata.gz' 16 | Checksum = 'f454ce0a5f506e0be062dc8aefb76b25' 17 | AUTHORIZED = False 18 | 19 | # valid YelpChi data 20 | with gzip.open(metadata_filename, 'rb') as f: 21 | file_content = f.read() 22 | f.close() 23 | if Checksum == get_hash(file_content): 24 | AUTHORIZED = True 25 | else: 26 | print('-' * 80) 27 | print('The demo data is not the intact data, if you need intact data, please download from:') 28 | print('http://odds.cs.stonybrook.edu/yelpchi-dataset/') 29 | print('-' * 80) 30 | 31 | """ 32 | read the graph and node priors 33 | user_product_graph: {'201': [('0', 1)], ... 
} 34 | product_user_graph: {'0': [('201', 1), ('202', 1), ...], ...} 35 | 36 | """ 37 | user_product_graph, product_user_graph = read_graph_data(metadata_filename) 38 | user_ground_truth, review_ground_truth = create_ground_truth(user_product_graph) 39 | 40 | # load priors 41 | with open(prefix + 'priors.pkl', 'rb') as f: 42 | priors = pkl.load(f) 43 | 44 | # convert user_product_graph to dict of dict 45 | # graph_dict: {'201': {'0': {'rating': 1, 'label': 1, 'date': '2011-06-08'}},...} 46 | graph_dict = dict() 47 | for k, v in user_product_graph.items(): 48 | graph_dict[k] = dict() 49 | for line in v: 50 | if line[2] == -1: 51 | new_line_2 = 0 52 | else: 53 | new_line_2 = 1 54 | # if demo data is not intact, generate rating randomly 55 | if type(line[1]) is str: 56 | new_line_1 = random.choice([0, 1]) 57 | elif line[1] >= 4: 58 | new_line_1 = 1 59 | else: 60 | new_line_1 = 2 61 | graph_dict[k][line[0]] = {'rating': new_line_1, 'label': new_line_2, 'date': line[3]} 62 | 63 | # put graph_dict into networkx graph 64 | G = dict_to_networkx(graph_dict) 65 | 66 | # we also can convert the graph into dict of dicts 67 | dict_of_dicts = nx.to_dict_of_dicts(G) 68 | 69 | # organize nodes' attributes, attributes must be the dict of dicts: 70 | # for example: {'201': {'prior': 0.1997974972380755, 'types': 'user'}, ...} 71 | user_node_priors = priors[0] 72 | node_attr = dict() 73 | for k, v in user_node_priors.items(): 74 | node_attr[k] = {'prior': v, 'types': 'user', 'label': user_ground_truth[k]} 75 | # add nodes' new attributes to the graph 76 | add_attribute_to_graph(graph=G, attribute=node_attr, adding_type='node') 77 | prod_node_priors = priors[2] 78 | node_attr = dict() 79 | for k, v in prod_node_priors.items(): 80 | node_attr[k] = {'prior': v, 'types': 'prod'} 81 | # add nodes' new attributes to the graph 82 | add_attribute_to_graph(graph=G, attribute=node_attr, adding_type='node') 83 | 84 | # check new attributes 85 | G.nodes.get('201') 86 | 87 | # organize edges'attributes, attributes must be the dict of dicts: 88 | # for example: {('201', '0'): {'prior': 0.35048557119705304, 'types': 'review'}, ...} 89 | edge_priors = priors[1] 90 | edge_attr = dict() 91 | for k, v in edge_priors.items(): 92 | edge_attr[k] = {'prior': v, 'types': 'review'} 93 | # add edges' new attributes to the graph 94 | add_attribute_to_graph(graph=G, attribute=edge_attr, adding_type='edge') 95 | # check new attributes 96 | G.edges.get(('201', '0')) 97 | 98 | # save graph data into json 99 | graph_name = 'Yelp_graph_data.json' 100 | save_graph(graph=G, graph_name=graph_name) 101 | 102 | # load json into graph 103 | loaded_G = load_graph(graph_name) 104 | 105 | 106 | if __name__ == '__main__': 107 | data_path = '../Yelp_Data/' 108 | data_to_network_graph(data_path) 109 | -------------------------------------------------------------------------------- /UGFraud/Demo/eval_Fraudar.py: -------------------------------------------------------------------------------- 1 | """ 2 | 'FRAUDAR: Bounding Graph Fraud in the Face of camouflage' 3 | Spot fraudsters in the presence of camouflage or hijacked accounts. An algorithm that is camouflage-resistant, 4 | provides upper bounds on the effectiveness of fraudsters, and the algorithm is effective in real-world data. 
5 | Article: https://bhooi.github.io/papers/fraudar_kdd16.pdf 6 | """ 7 | 8 | from UGFraud.Utils.helper import * 9 | from UGFraud.Detector.Fraudar import * 10 | import copy as cp 11 | import sys 12 | import os 13 | sys.path.insert(0, os.path.abspath('../../')) 14 | 15 | 16 | def listToSparseMatrix(edgesSource, edgesDest): 17 | m = max(edgesSource) + 1 18 | n = max(edgesDest) + 1 19 | M = sparse.coo_matrix(([1] * len(edgesSource), (edgesSource, edgesDest)), shape=(m, n)) 20 | M1 = M > 0 21 | return M1.astype('int') 22 | 23 | 24 | @timer 25 | def runFraudar(graph, multiple=0): 26 | new_upriors = node_attr_filter(graph, 'types', 'user', 'prior') 27 | new_rpriors = edge_attr_filter(graph, 'types', 'review', 'prior') 28 | # print('Start detection on the new graph with Fraudar') 29 | user_to_product = {} 30 | prod_to_user = {} 31 | u_id_dict = node_attr_filter(graph, 'types', 'user', 'types') 32 | for u_id in u_id_dict.keys(): 33 | if u_id not in user_to_product: 34 | user_to_product[u_id] = [] 35 | for p_id in graph[u_id].keys(): 36 | if p_id not in prod_to_user: 37 | prod_to_user[p_id] = [] 38 | user_to_product[u_id].append(p_id) 39 | prod_to_user[p_id].append(u_id) 40 | u_id2idx = {} 41 | p_id2idx = {} 42 | idx2u_id = {} 43 | idx2p_id = {} 44 | i = 0 45 | for u_id in user_to_product.keys(): 46 | u_id2idx[u_id] = i 47 | idx2u_id[i] = u_id 48 | i += 1 49 | 50 | i = 0 51 | for p_id in prod_to_user.keys(): 52 | p_id2idx[p_id] = i 53 | idx2p_id[i] = p_id 54 | i += 1 55 | 56 | edgesSource = [] 57 | edgesDest = [] 58 | for u_id in u_id_dict.keys(): 59 | for p_id in graph[u_id].keys(): 60 | edgesSource.append(u_id2idx[u_id]) 61 | edgesDest.append(p_id2idx[p_id]) 62 | M = listToSparseMatrix(edgesSource, edgesDest) 63 | # print("finished reading data ") 64 | 65 | if multiple == 0: 66 | # detect all dense blocks 67 | res = detect_blocks(M, logWeightedAveDegree) 68 | else: 69 | # detect the top #multiple dense blocks 70 | res = detectMultiple(M, logWeightedAveDegree, multiple) 71 | 72 | detected_users = {} 73 | weight_dict = {} 74 | for lwRes in res: 75 | detected_u_idx = lwRes[0][0] 76 | detected_p_idx = lwRes[0][1] 77 | weight = lwRes[1] 78 | weight_dict[weight] = weight 79 | for i in detected_u_idx: 80 | uid_tmp = idx2u_id[i] 81 | if uid_tmp not in detected_users.keys(): 82 | detected_users[uid_tmp] = weight 83 | 84 | max_den = res[0][1] 85 | min_den = res[-1][1] 86 | den_interval = max_den - min_den 87 | 88 | ranked_rpriors = [(review, new_rpriors[review]) for review in new_rpriors.keys()] 89 | ranked_rpriors = sorted(ranked_rpriors, reverse=True, key=lambda x: x[1]) 90 | r_max, r_mean, r_min = ranked_rpriors[0][1], ranked_rpriors[int(len(ranked_rpriors) / 2)][1], ranked_rpriors[-1][1] 91 | aux_rpriors = cp.deepcopy(new_rpriors) 92 | for i, p in aux_rpriors.items(): 93 | new_rpriors[i] = (p - r_min) / (r_max - r_min) 94 | 95 | user_density = {} 96 | for u in new_upriors.keys(): 97 | if u in detected_users.keys(): 98 | user_density[u] = (detected_users[u] - min_den) / den_interval 99 | else: 100 | user_density[u] = 1e-6 101 | 102 | user_prob = {} 103 | review_prob = {} 104 | for review in new_rpriors.keys(): 105 | review_prob.update({review: 1e-6}) 106 | user_prob.update({review[0]: 1e-6}) 107 | print(len(detected_users)) 108 | print(detected_users['302']) 109 | 110 | for user in detected_users.keys(): 111 | user_prob.update({user: user_density[user]}) 112 | for prod in graph[user].keys(): 113 | review_prob.update({(user, prod): user_density[user]}) 114 | 115 | return user_prob, review_prob 116 | 117 
| 118 | if __name__ == '__main__': 119 | # data source 120 | file_name = 'Yelp_graph_data.json' 121 | G = load_graph(file_name) 122 | review_ground_truth = edge_attr_filter(G, 'types', 'review', 'label') 123 | 124 | # run Fraudar on the reviews 125 | userBelief, reviewBelief = runFraudar(G, multiple=0) 126 | reviewBelief = scale_value(reviewBelief) 127 | 128 | review_AUC, review_AP = evaluate(review_ground_truth, reviewBelief) 129 | print('review AUC = {}'.format(review_AUC)) 130 | print('review AP = {}'.format(review_AP)) 131 | 132 | 133 | -------------------------------------------------------------------------------- /UGFraud/Demo/eval_GANG.py: -------------------------------------------------------------------------------- 1 | from UGFraud.Detector.GANG import * 2 | import sys 3 | import os 4 | sys.path.insert(0, os.path.abspath('../../')) 5 | 6 | 7 | if __name__ == '__main__': 8 | # data source 9 | file_name = 'Yelp_graph_data.json' 10 | G = load_graph(file_name) 11 | user_ground_truth = node_attr_filter(G, 'types', 'user', 'label') 12 | review_ground_truth = edge_attr_filter(G, 'types', 'review', 'label') 13 | 14 | # add semi-supervised user information / threshold 15 | sup_per = 0.1 16 | 17 | # run GANG model 18 | model = GANG(G, user_ground_truth, sup_per, nor_flg=True, sup_flg=False) 19 | 20 | # run Linearized Belief Propagation on product-user matrix with 1000 iterations 21 | iteration = 1000 22 | model.pu_lbp(iteration) 23 | userBelief, _, reviewBelief = model.classify() 24 | reviewBelief = scale_value(reviewBelief) 25 | 26 | # evaluation 27 | review_AUC, review_AP = evaluate(review_ground_truth, reviewBelief) 28 | print('review AUC = {}'.format(review_AUC)) 29 | print('review AP = {}'.format(review_AP)) 30 | -------------------------------------------------------------------------------- /UGFraud/Demo/eval_Prior.py: -------------------------------------------------------------------------------- 1 | from Utils.helper import * 2 | 3 | if __name__ == '__main__': 4 | # data source 5 | file_name = 'Yelp_graph_data.json' 6 | G = load_graph(file_name) 7 | review_ground_truth = edge_attr_filter(G, 'types', 'review', 'label') 8 | 9 | # normalize the review prior as the review suspicious belief 10 | rpriors = edge_attr_filter(G, 'types', 'review', 'prior') 11 | reviewBelief = scale_value(rpriors) 12 | 13 | review_AUC, review_AP = evaluate(review_ground_truth, reviewBelief) 14 | print('review AUC = {}'.format(review_AUC)) 15 | print('review AP = {}'.format(review_AP)) 16 | -------------------------------------------------------------------------------- /UGFraud/Demo/eval_SVD.py: -------------------------------------------------------------------------------- 1 | from UGFraud.Detector.SVD import * 2 | import sys 3 | import os 4 | sys.path.insert(0, os.path.abspath('../../')) 5 | 6 | if __name__ == '__main__': 7 | # data source 8 | file_name = 'Yelp_graph_data.json' 9 | G = load_graph(file_name) 10 | user_ground_truth = node_attr_filter(G, 'types', 'user', 'label') 11 | 12 | percent = 0.9 13 | model = SVD(G) 14 | svd_output = model.run(percent) 15 | result = model.evaluate_SVD(svd_output, G) 16 | index = list(map(str, map(int, result[0]))) 17 | userBelief = dict(zip(index, result[1])) 18 | review_AUC, review_AP = evaluate(user_ground_truth, userBelief) 19 | print('review AUC = {}'.format(review_AUC)) 20 | print('review AP = {}'.format(review_AP)) -------------------------------------------------------------------------------- /UGFraud/Demo/eval_SpEagle.py: 
-------------------------------------------------------------------------------- 1 | from UGFraud.Detector.SpEagle import * 2 | 3 | if __name__ == '__main__': 4 | # data source 5 | file_name = 'Yelp_graph_data.json' 6 | G = load_graph(file_name) 7 | review_ground_truth = edge_attr_filter(G, 'types', 'review', 'label') 8 | 9 | # input parameters: numerical_eps, eps, num_iters, stop_threshold 10 | numerical_eps = 1e-5 11 | eps = 0.1 12 | user_review_potential = np.log(np.array([[1 - numerical_eps, numerical_eps], [numerical_eps, 1 - numerical_eps]])) 13 | review_product_potential = np.log(np.array([[1 - eps, eps], [eps, 1 - eps]])) 14 | potentials = {'u_r': user_review_potential, 'r_u': user_review_potential, 15 | 'r_p': review_product_potential, 'p_r': review_product_potential} 16 | max_iters = 4 17 | stop_threshold = 1e-3 18 | 19 | model = SpEagle(G, potentials, message=None, max_iters=4) 20 | 21 | # new runbp func 22 | model.schedule(schedule_type='bfs') 23 | 24 | iter = 0 25 | num_bp_iters = 2 26 | model.run_bp(start_iter=iter, max_iters=num_bp_iters, tol=stop_threshold) 27 | 28 | userBelief, reviewBelief, _ = model.classify() 29 | 30 | review_AUC, review_AP = evaluate(review_ground_truth, reviewBelief) 31 | print('review AUC = {}'.format(review_AUC)) 32 | print('review AP = {}'.format(review_AP)) 33 | -------------------------------------------------------------------------------- /UGFraud/Demo/eval_ZooBP.py: -------------------------------------------------------------------------------- 1 | from UGFraud.Utils.helper import * 2 | from UGFraud.Detector.ZooBP import * 3 | import sys 4 | import os 5 | sys.path.insert(0, os.path.abspath('../../')) 6 | 7 | 8 | if __name__ == '__main__': 9 | # data source 10 | file_name = 'Yelp_graph_data.json' 11 | G = load_graph(file_name) 12 | user_ground_truth = node_attr_filter(G, 'types', 'user', 'label') 13 | 14 | ep = 0.01 15 | # H: compatibility matrix 16 | H = np.array([[0.5, -0.5], [-0.5, 0.5]]) 17 | 18 | model = ZooBP(G, ep, H) 19 | userBelief, _ = model.run() # result = (user_beliefs, prod_beliefs) 20 | 21 | review_AUC, review_AP = evaluate(user_ground_truth, userBelief) 22 | print('review AUC = {}'.format(review_AUC)) 23 | print('review AP = {}'.format(review_AP)) -------------------------------------------------------------------------------- /UGFraud/Demo/eval_fBox.py: -------------------------------------------------------------------------------- 1 | from UGFraud.Utils.helper import * 2 | from UGFraud.Detector.fBox import * 3 | 4 | 5 | def runfBox(graph, t, k): 6 | user_priors = node_attr_filter(graph, 'types', 'user', 'prior') 7 | review_priors = edge_attr_filter(graph, 'types', 'review', 'prior') 8 | 9 | # run fBox 10 | model = fBox(graph) 11 | num_detected_users = [] 12 | 13 | detected_users_by_degree, detected_products_by_degree = model.run(t, k) 14 | detected_users = set() 15 | for d, user_list in detected_users_by_degree.items(): 16 | detected_users.update([u for u in user_list]) 17 | 18 | num_detected_users.append(len(detected_users)) 19 | 20 | detected_products = set() 21 | for d, prod_list in detected_products_by_degree.items(): 22 | detected_products.update([p for p in prod_list]) 23 | 24 | result_uid = [] 25 | user_prob = {} # result_prob means user_prob 26 | review_prob = {} 27 | for u, v in user_priors.items(): 28 | result_uid.append(u) 29 | if u in detected_users: 30 | user_prob.update({u: user_priors.get(u)}) 31 | else: 32 | user_prob.update({u: 1e-7}) 33 | 34 | for user_prod in graph.edges: 35 | if user_prod[0] in 
detected_users: 36 | review_prob[(user_prod[0], user_prod[1])] = review_priors.get((user_prod[0], user_prod[1])) 37 | else: 38 | review_prob[(user_prod[0], user_prod[1])] = 0 39 | 40 | return user_prob, review_prob 41 | 42 | 43 | if __name__ == '__main__': 44 | # data source 45 | file_name = 'Yelp_graph_data.json' 46 | G = load_graph(file_name) 47 | review_ground_truth = edge_attr_filter(G, 'types', 'review', 'label') 48 | 49 | # important parameters 50 | t = 20 # taus = [0.5, 1, 5, 10, 25, 50, 99] 51 | k = 50 # k = range(10, 51, 10) 52 | 53 | userBelief, reviewBelief = runfBox(G, t, k) 54 | 55 | reviewBelief = scale_value(reviewBelief) 56 | 57 | review_AUC, review_AP = evaluate(review_ground_truth, reviewBelief) 58 | print('review AUC = {}'.format(review_AUC)) 59 | print('review AP = {}'.format(review_AP)) 60 | -------------------------------------------------------------------------------- /UGFraud/Demo/testing.py: -------------------------------------------------------------------------------- 1 | from UGFraud.Demo.eval_fBox import * 2 | from UGFraud.Demo.eval_Fraudar import * 3 | from UGFraud.Demo.eval_GANG import * 4 | from UGFraud.Demo.eval_SpEagle import * 5 | from UGFraud.Demo.eval_SVD import * 6 | from UGFraud.Demo.eval_ZooBP import * 7 | from UGFraud.Demo.demo_pre import * 8 | import sys 9 | import os 10 | 11 | sys.path.insert(0, os.path.abspath('../../')) 12 | 13 | # data source 14 | file_name = 'Yelp_graph_data.json' 15 | try: 16 | G = load_graph(file_name) 17 | except FileNotFoundError: 18 | data_path = '../Yelp_Data/' 19 | data_to_network_graph(data_path) 20 | G = load_graph(file_name) 21 | user_ground_truth = node_attr_filter(G, 'types', 'user', 'label') 22 | review_ground_truth = edge_attr_filter(G, 'types', 'review', 'label') 23 | 24 | """ 25 | testing fBox 26 | """ 27 | print("*" * 80) 28 | print("Testing fBox") 29 | t = 20 # taus = [0.5, 1, 5, 10, 25, 50, 99] 30 | k = 50 # k = range(10, 51, 10) 31 | serBelief, reviewBelief = runfBox(G, t, k) 32 | reviewBelief = scale_value(reviewBelief) 33 | 34 | review_AUC, review_AP = evaluate(review_ground_truth, reviewBelief) 35 | print('review AUC = {}'.format(review_AUC)) 36 | print('review AP = {}'.format(review_AP)) 37 | 38 | """ 39 | testing Fraudar 40 | """ 41 | print("*" * 80) 42 | print("Testing Fraudar") 43 | userBelief, reviewBelief = runFraudar(G, multiple=0) 44 | reviewBelief = scale_value(reviewBelief) 45 | 46 | review_AUC, review_AP = evaluate(review_ground_truth, reviewBelief) 47 | print('review AUC = {}'.format(review_AUC)) 48 | print('review AP = {}'.format(review_AP)) 49 | 50 | """ 51 | testing GANG 52 | """ 53 | print("*" * 80) 54 | print("Testing GANG") 55 | # add semi-supervised user information / threshold 56 | sup_per = 0.1 57 | 58 | # run GANG model 59 | model = GANG(G, user_ground_truth, sup_per, nor_flg=True, sup_flg=False) 60 | 61 | # run Linearized Belief Propagation on product-user matrix with 1000 iterations 62 | iteration = 1000 63 | model.pu_lbp(iteration) 64 | userBelief, _, reviewBelief = model.classify() 65 | reviewBelief = scale_value(reviewBelief) 66 | 67 | # evaluation 68 | review_AUC, review_AP = evaluate(review_ground_truth, reviewBelief) 69 | print('review AUC = {}'.format(review_AUC)) 70 | print('review AP = {}'.format(review_AP)) 71 | 72 | """ 73 | testing Prior 74 | """ 75 | print("*" * 80) 76 | print("Testing Prior") 77 | # normalize the review prior as the review suspicious belief 78 | rpriors = edge_attr_filter(G, 'types', 'review', 'prior') 79 | reviewBelief = scale_value(rpriors) 80 | 
81 | review_AUC, review_AP = evaluate(review_ground_truth, reviewBelief) 82 | print('review AUC = {}'.format(review_AUC)) 83 | print('review AP = {}'.format(review_AP)) 84 | 85 | """ 86 | testing SpEagle 87 | """ 88 | print("*" * 80) 89 | print("Testing SpEagle") 90 | # input parameters: numerical_eps, eps, num_iters, stop_threshold 91 | numerical_eps = 1e-5 92 | eps = 0.1 93 | user_review_potential = np.log(np.array([[1 - numerical_eps, numerical_eps], [numerical_eps, 1 - numerical_eps]])) 94 | review_product_potential = np.log(np.array([[1 - eps, eps], [eps, 1 - eps]])) 95 | potentials = {'u_r': user_review_potential, 'r_u': user_review_potential, 96 | 'r_p': review_product_potential, 'p_r': review_product_potential} 97 | max_iters = 4 98 | stop_threshold = 1e-3 99 | 100 | model = SpEagle(G, potentials, message=None, max_iters=4) 101 | 102 | # new runbp func 103 | model.schedule(schedule_type='bfs') 104 | 105 | iter = 0 106 | num_bp_iters = 2 107 | model.run_bp(start_iter=iter, max_iters=num_bp_iters, tol=stop_threshold) 108 | 109 | userBelief, reviewBelief, _ = model.classify() 110 | 111 | review_AUC, review_AP = evaluate(review_ground_truth, reviewBelief) 112 | print('review AUC = {}'.format(review_AUC)) 113 | print('review AP = {}'.format(review_AP)) 114 | 115 | """ 116 | testing SVG 117 | """ 118 | print("*" * 80) 119 | print("Testing SVD") 120 | percent = 0.9 121 | model = SVD(G) 122 | svd_output = model.run(percent) 123 | result = model.evaluate_SVD(svd_output, G) 124 | index = list(map(str, map(int, result[0]))) 125 | userBelief = dict(zip(index, result[1])) 126 | review_AUC, review_AP = evaluate(user_ground_truth, userBelief) 127 | print('review AUC = {}'.format(review_AUC)) 128 | print('review AP = {}'.format(review_AP)) 129 | 130 | """ 131 | testing ZooBP 132 | """ 133 | print("*" * 80) 134 | print("Testing ZooBp") 135 | ep = 0.01 136 | # H: compatibility matrix 137 | H = np.array([[0.5, -0.5], [-0.5, 0.5]]) 138 | 139 | model = ZooBP(G, ep, H) 140 | userBelief, _ = model.run() # result = (user_beliefs, prod_beliefs) 141 | 142 | review_AUC, review_AP = evaluate(user_ground_truth, userBelief) 143 | print('review AUC = {}'.format(review_AUC)) 144 | print('review AP = {}'.format(review_AP)) 145 | -------------------------------------------------------------------------------- /UGFraud/Detector/Fraudar.py: -------------------------------------------------------------------------------- 1 | """ 2 | contains functions that run the greedy detector for dense regions in a sparse matrix. 3 | use aveDegree or sqrtWeightedAveDegree or logWeightedAveDegree on a sparse matrix, 4 | which returns ((rowSet, colSet), score) for the most suspicious block. 5 | """ 6 | 7 | from __future__ import division 8 | from UGFraud.Detector.MinTree import MinTree 9 | from scipy import sparse 10 | import random 11 | import numpy as np 12 | 13 | 14 | # given a list of lists where each row is an edge, this returns the sparse matrix representation of the data. 15 | def listToSparseMatrix(edges_source, edges_des): 16 | m = max(edges_source) + 1 17 | n = max(edges_des) + 1 18 | M = sparse.coo_matrix(([1] * len(edges_source), (edges_source, edges_des)), shape=(m, n)) 19 | M1 = M > 0 20 | return M1.astype('int') 21 | 22 | 23 | # reads matrix from file and returns sparse matrix. 
first 2 columns should be row and column indices 24 | def readData(filename): 25 | edgesSource = [] 26 | edgesDest = [] 27 | with open(filename) as f: 28 | for line in f: 29 | toks = line.split() 30 | edgesSource.append(int(toks[0])) 31 | edgesDest.append(int(toks[1])) 32 | return listToSparseMatrix(edgesSource, edgesDest) 33 | 34 | 35 | def detectMultiple(M, detectFunc, numToDetect): 36 | Mcur = M.copy().tolil() 37 | res = [] 38 | for i in range(numToDetect): 39 | ((rowSet, colSet), score) = detectFunc(Mcur) 40 | res.append(((rowSet, colSet), score)) 41 | (rs, cs) = Mcur.nonzero() 42 | for i in range(len(rs)): 43 | if rs[i] in rowSet and cs[i] in colSet: 44 | Mcur[rs[i], cs[i]] = 0 45 | return res 46 | 47 | 48 | def detect_blocks(M, detectFunc): 49 | Mcur = M.copy().tolil() 50 | res = [] 51 | while True: 52 | ((rowSet, colSet), score) = detectFunc(Mcur) 53 | block = ((rowSet, colSet), score) 54 | if len(res) > 0: 55 | if abs(block[1] - res[-1][1]) < 0.01: 56 | break 57 | res.append(block) 58 | 59 | (rs, cs) = Mcur.nonzero() 60 | for i in range(len(rs)): 61 | if rs[i] in rowSet and cs[i] in colSet: 62 | Mcur[rs[i], cs[i]] = 0 63 | return res 64 | 65 | 66 | """ 67 | Inject a clique of size m0 by n0, with density pp. the last parameter testIdx determines the camouflage type. 68 | testIdx = 1: random camouflage, with camouflage density set so each fraudster outputs approximately 69 | equal number of fraudulent and camouflage edges 70 | testIdx = 2: random camouflage, with double the density as in the previous setting 71 | testIdx = 3: biased camouflage, more likely to add camouflage to high degree columns 72 | """ 73 | 74 | 75 | def injectCliqueCamo(M, m0, n0, p, testIdx): 76 | (m, n) = M.shape 77 | M2 = M.copy().tolil() 78 | 79 | colSum = np.squeeze(M2.sum(axis=0).A) 80 | colSumPart = colSum[n0:n] 81 | colSumPartPro = np.int_(colSumPart) 82 | colIdx = np.arange(n0, n, 1) 83 | population = np.repeat(colIdx, colSumPartPro, axis=0) 84 | 85 | for i in range(m0): 86 | # inject clique 87 | for j in range(n0): 88 | if random.random() < p: 89 | M2[i, j] = 1 90 | # inject camo 91 | if testIdx == 1: 92 | thres = p * n0 / (n - n0) 93 | for j in range(n0, n): 94 | if random.random() < thres: 95 | M2[i, j] = 1 96 | if testIdx == 2: 97 | thres = 2 * p * n0 / (n - n0) 98 | for j in range(n0, n): 99 | if random.random() < thres: 100 | M2[i, j] = 1 101 | # biased camo 102 | if testIdx == 3: 103 | colRplmt = random.sample(population, int(n0 * p)) 104 | M2[i, colRplmt] = 1 105 | 106 | return M2.tocsc() 107 | 108 | 109 | # sum of weighted edges in rowSet and colSet in matrix M 110 | def c2Score(M, rowSet, colSet): 111 | return M[list(rowSet), :][:, list(colSet)].sum(axis=None) 112 | 113 | 114 | def jaccard(pred, actual): 115 | intersectSize = len(set.intersection(pred[0], actual[0])) + len(set.intersection(pred[1], actual[1])) 116 | unionSize = len(set.union(pred[0], actual[0])) + len(set.union(pred[1], actual[1])) 117 | return intersectSize / unionSize 118 | 119 | 120 | def getPrecision(pred, actual): 121 | intersectSize = len(set.intersection(pred[0], actual[0])) + len(set.intersection(pred[1], actual[1])) 122 | return intersectSize / (len(pred[0]) + len(pred[1])) 123 | 124 | 125 | def getRecall(pred, actual): 126 | intersectSize = len(set.intersection(pred[0], actual[0])) + len(set.intersection(pred[1], actual[1])) 127 | return intersectSize / (len(actual[0]) + len(actual[1])) 128 | 129 | 130 | def getFMeasure(pred, actual): 131 | prec = getPrecision(pred, actual) 132 | rec = getRecall(pred, actual) 133 | 
return 0 if (prec + rec == 0) else (2 * prec * rec / (prec + rec)) 134 | 135 | 136 | def getRowPrecision(pred, actual, idx): 137 | intersectSize = len(set.intersection(pred[idx], actual[idx])) 138 | return intersectSize / len(pred[idx]) 139 | 140 | 141 | def getRowRecall(pred, actual, idx): 142 | intersectSize = len(set.intersection(pred[idx], actual[idx])) 143 | return intersectSize / len(actual[idx]) 144 | 145 | 146 | def getRowFMeasure(pred, actual, idx): 147 | prec = getRowPrecision(pred, actual, idx) 148 | rec = getRowRecall(pred, actual, idx) 149 | return 0 if (prec + rec == 0) else (2 * prec * rec / (prec + rec)) 150 | 151 | 152 | # run greedy algorithm using square root column weights 153 | def sqrtWeightedAveDegree(M): 154 | (m, n) = M.shape 155 | colSums = M.sum(axis=0) 156 | colWeights = 1.0 / np.sqrt(np.squeeze(colSums) + 5) 157 | colDiag = sparse.lil_matrix((n, n)) 158 | colDiag.setdiag(colWeights) 159 | W = M * colDiag 160 | return fastGreedyDecreasing(W, colWeights) 161 | 162 | 163 | # run greedy algorithm using logarithmic weights 164 | def logWeightedAveDegree(M): 165 | (m, n) = M.shape 166 | colSums = M.sum(axis=0) 167 | colWeights = np.squeeze(np.array(1.0 / np.log(np.squeeze(colSums) + 5))) 168 | colDiag = sparse.lil_matrix((n, n)) 169 | colDiag.setdiag(colWeights) 170 | W = M * colDiag 171 | # print('finished computing weight matrix') 172 | return fastGreedyDecreasing(W, colWeights) 173 | 174 | 175 | def aveDegree(M): 176 | (m, n) = M.shape 177 | return fastGreedyDecreasing(M, [1] * n) 178 | 179 | 180 | def subsetAboveDegree(M, col_thres, row_thres): 181 | M = M.tocsc() 182 | (m, n) = M.shape 183 | colSums = np.squeeze(np.array(M.sum(axis=0))) 184 | rowSums = np.squeeze(np.array(M.sum(axis=1))) 185 | colValid = colSums > col_thres 186 | rowValid = rowSums > row_thres 187 | M1 = M[:, colValid].tocsr() 188 | M2 = M1[rowValid, :] 189 | rowFilter = [i for i in range(m) if rowValid[i]] 190 | colFilter = [i for i in range(n) if colValid[i]] 191 | return M2, rowFilter, colFilter 192 | 193 | 194 | # @profile 195 | def fastGreedyDecreasing(M, colWeights): 196 | (m, n) = M.shape 197 | Md = M.todok() 198 | Ml = M.tolil() 199 | Mlt = M.transpose().tolil() 200 | rowSet = set(range(0, m)) 201 | colSet = set(range(0, n)) 202 | curScore = c2Score(M, rowSet, colSet) 203 | bestAveScore = curScore / (len(rowSet) + len(colSet)) 204 | bestSets = (rowSet, colSet) 205 | rowDeltas = np.squeeze(M.sum(axis=1).A) # *decrease* in total weight when *removing* this row 206 | colDeltas = np.squeeze(M.sum(axis=0).A) 207 | rowTree = MinTree(rowDeltas) 208 | colTree = MinTree(colDeltas) 209 | 210 | numDeleted = 0 211 | deleted = [] 212 | bestNumDeleted = 0 213 | 214 | while rowSet and colSet: 215 | nextRow, rowDelt = rowTree.getMin() 216 | nextCol, colDelt = colTree.getMin() 217 | if rowDelt <= colDelt: 218 | curScore -= rowDelt 219 | for j in Ml.rows[nextRow]: 220 | delt = colWeights[j] 221 | colTree.changeVal(j, -colWeights[j]) 222 | rowSet -= {nextRow} 223 | rowTree.changeVal(nextRow, float('inf')) 224 | deleted.append((0, nextRow)) 225 | else: 226 | curScore -= colDelt 227 | for i in Mlt.rows[nextCol]: 228 | delt = colWeights[nextCol] 229 | rowTree.changeVal(i, -colWeights[nextCol]) 230 | colSet -= {nextCol} 231 | colTree.changeVal(nextCol, float('inf')) 232 | deleted.append((1, nextCol)) 233 | 234 | numDeleted += 1 235 | curAveScore = curScore / (len(colSet) + len(rowSet)) 236 | 237 | if curAveScore > bestAveScore: 238 | bestAveScore = curAveScore 239 | bestNumDeleted = numDeleted 240 | 241 | # 
reconstruct the best row and column sets 242 | finalRowSet = set(range(m)) 243 | finalColSet = set(range(n)) 244 | for i in range(bestNumDeleted): 245 | if deleted[i][0] == 0: 246 | finalRowSet.remove(deleted[i][1]) 247 | else: 248 | finalColSet.remove(deleted[i][1]) 249 | return ((finalRowSet, finalColSet), bestAveScore) 250 | -------------------------------------------------------------------------------- /UGFraud/Detector/GANG.py: -------------------------------------------------------------------------------- 1 | """ 2 | 'GANG: Detecting Fraudulent Users in Online Social Networks via Guilt-by-Association on Directed Graphs' 3 | A guilt-by-association method on directed graphs, to detect fraudulent users in OSNs. 4 | Article: http://people.duke.edu/~zg70/papers/GANG.pdf 5 | """ 6 | 7 | from scipy.sparse import lil_matrix 8 | from UGFraud.Utils.helper import * 9 | import random 10 | 11 | 12 | def semi_data(ground_truth, portion): 13 | """ 14 | produce the sampled labeled review id used for semi-supervised prior 15 | :param ground_truth: dict of ground truth {uid:label} or {rid:label} 16 | :param portion: portion of the labeled data 17 | :return: review id which are used for supervising 18 | """ 19 | 20 | smaple_size = int(len(ground_truth) * portion * 0.5) 21 | total_list = [r for r in ground_truth.keys()] 22 | pos_list = [] 23 | neg_list = [] 24 | for id, label in ground_truth.items(): 25 | if label == 1: 26 | pos_list.append(id) 27 | else: 28 | neg_list.append(id) 29 | 30 | pos_sample = [pos_list[i] for i in sorted(random.sample(range(len(pos_list)), smaple_size))] 31 | neg_sample = [neg_list[i] for i in sorted(random.sample(range(len(neg_list)), smaple_size))] 32 | 33 | pos_ids = [total_list.index(s) for s in pos_sample] 34 | neg_ids = [total_list.index(s) for s in neg_sample] 35 | 36 | return pos_ids, neg_ids 37 | 38 | 39 | class GANG: 40 | 41 | def __init__(self, graph, user_ground_truth, sup_per, nor_flg, sup_flg=False): 42 | 43 | # number of dimensions of product-user matrix 44 | u_prior = node_attr_filter(graph, 'types', 'user', 'prior') 45 | p_prior = node_attr_filter(graph, 'types', 'prod', 'prior') 46 | r_prior = edge_attr_filter(graph, 'types', 'review', 'prior') 47 | priors = [u_prior, r_prior, p_prior] 48 | self.pu_dim = len(priors[0])+len(priors[2]) 49 | # spam belief prior vector 50 | self.res_pu_spam_prior_vector = None 51 | # diagonal matrix used for normalization 52 | self.diag_pu_matrix = None 53 | # product-user spam posterior belief vector 54 | self.res_pu_spam_post_vector = np.zeros((self.pu_dim, 1)) 55 | # sparse row matrix is faster when multiply with vectors 56 | self.pu_csr_matrix = None 57 | self.diag_pu_csr_matrix = None 58 | self.nor_pu_csr_matrix = None 59 | # priors dictionary 60 | self.u_priors = priors[0] 61 | self.r_priors = priors[1] 62 | self.p_priors = priors[2] 63 | # build prior belief vector 64 | p_vector, u_vector, r_vector = [], [], [] 65 | if nor_flg: 66 | # the mean value with normalization 67 | u_mean, p_mean, r_mean = 0.5, 0.5, 0.5 68 | else: 69 | # the mean value without normalization 70 | priors, mean_priors = nor_priors(priors) 71 | u_mean, r_mean, p_mean = mean_priors[0], mean_priors[1], mean_priors[2] 72 | 73 | for u in priors[0].values(): 74 | u_vector.append(u) 75 | for p in priors[2].values(): 76 | p_vector.append(p) 77 | 78 | res_u_vector = [i-u_mean for i in u_vector] 79 | res_p_vector = [i-p_mean for i in p_vector] 80 | 81 | # add semi-supervised user information 82 | if sup_flg: 83 | pos_ids, neg_ids = 
semi_data(user_ground_truth, sup_per) 84 | for iter, prob in enumerate(res_u_vector): 85 | if iter in pos_ids: 86 | res_u_vector[iter] = 1 - u_mean 87 | elif iter in neg_ids: 88 | res_u_vector[iter] = 0 - u_mean 89 | 90 | # aggregate the prior vectors 91 | res_pu_vector = res_p_vector + res_u_vector 92 | 93 | self.res_pu_spam_prior_vector = np.c_[res_pu_vector] 94 | 95 | # build product-user adjacency sparse matrix 96 | self.pu_matrix = lil_matrix((self.pu_dim, self.pu_dim)) 97 | 98 | # create the pu diagonal matrix 99 | self.diag_pu_matrix = lil_matrix((self.pu_dim, self.pu_dim)) 100 | for id in range(0, self.pu_dim): 101 | if id < len(self.p_priors): 102 | self.diag_pu_matrix[id, id] = len(graph[str(id)]) 103 | else: 104 | self.diag_pu_matrix[id, id] = len(graph[str(id)]) 105 | 106 | for p_id in p_prior.keys(): 107 | for neighbor_id in graph[p_id].keys(): 108 | self.pu_matrix[int(p_id), int(neighbor_id)] = 1 109 | 110 | for u_id in u_prior.keys(): 111 | for neighbor_id in graph[u_id].keys(): 112 | self.pu_matrix[int(u_id), int(neighbor_id)] = 1 113 | 114 | @timer 115 | def pu_lbp(self, max_iters): 116 | """ 117 | Run the matrix form of lbp on the product-user sparse matrix 118 | :return: the posterior belief vector of products and users 119 | """ 120 | 121 | # transfer to sparse row matrix to accelerate calculation 122 | self.pu_csr_matrix = self.pu_matrix.tocsr() 123 | self.diag_pu_csr_matrix = self.diag_pu_matrix.tocsr() 124 | 125 | i = 0 126 | while i < max_iters: 127 | sum_0 = np.sum(self.res_pu_spam_post_vector) 128 | self.res_pu_spam_post_vector = self.res_pu_spam_prior_vector + 2 * 0.008 * (self.pu_csr_matrix.dot(self.res_pu_spam_post_vector)) 129 | sum_1 = np.sum(self.res_pu_spam_post_vector) 130 | 131 | # print('iter: ' + str(i)) 132 | # print('diff: ' + str(abs(sum_0 - sum_1))) 133 | 134 | i += 1 135 | 136 | if abs(sum_0 - sum_1) < 0.1: 137 | return abs(sum_0 - sum_1) 138 | 139 | @timer 140 | def classify(self): 141 | """ 142 | Calculate the posterior belief of three type of nodes 143 | :return: u_post: users posterior beliefs, p_post: products posterior beliefs, 144 | r_post: reviews posterior beliefs. 145 | """ 146 | u_post = {} 147 | p_post = {} 148 | r_post = {} 149 | pu_post = self.res_pu_spam_post_vector 150 | no_prod = len(self.p_priors) 151 | # extract the posterior belief of users and reviews 152 | for i, r in enumerate(pu_post[no_prod:]): 153 | u_post[str(i + no_prod)] = float(r) 154 | for i, r in enumerate(pu_post[:no_prod]): 155 | p_post[str(i)] = float(r) 156 | for i, r in self.r_priors.items(): 157 | r_post[i] = (u_post[i[0]] + float(r)) / 2 158 | 159 | u_post = scale_value(u_post) 160 | p_post = scale_value(p_post) 161 | r_post = scale_value(r_post) 162 | 163 | return u_post, p_post, r_post -------------------------------------------------------------------------------- /UGFraud/Detector/MinTree.py: -------------------------------------------------------------------------------- 1 | """ 2 | A tree data structure which stores a list of degrees and can quickly retrieve the min degree element, 3 | or modify any of the degrees, each in logarithmic time. It works by creating a binary tree with the 4 | given elements in the leaves, where each internal node stores the min of its two children. 
5 | """ 6 | 7 | import math 8 | 9 | 10 | class MinTree: 11 | def __init__(self, degrees): 12 | self.input_length = len(degrees) 13 | self.height = int(math.ceil(math.log(len(degrees), 2))) 14 | self.numLeaves = 2 ** self.height 15 | self.numBranches = self.numLeaves - 1 16 | self.n = self.numBranches + self.numLeaves 17 | self.nodes = [float('inf')] * self.n 18 | for i in range(len(degrees)): 19 | self.nodes[self.numBranches + i] = degrees[i] 20 | for i in reversed(range(self.numBranches)): 21 | self.nodes[i] = min(self.nodes[2 * i + 1], self.nodes[2 * i + 2]) 22 | 23 | def getMin(self): 24 | cur = 0 25 | for i in range(self.height): 26 | cur = (2 * cur + 1) if self.nodes[2 * cur + 1] <= self.nodes[2 * cur + 2] else (2 * cur + 2) 27 | # print "found min at %d: %d" % (cur, self.nodes[cur]) 28 | return (cur - self.numBranches, self.nodes[cur]) 29 | 30 | def changeVal(self, idx, delta): 31 | cur = self.numBranches + idx 32 | self.nodes[cur] += delta 33 | for i in range(self.height): 34 | cur = (cur - 1) // 2 35 | nextParent = min(self.nodes[2 * cur + 1], self.nodes[2 * cur + 2]) 36 | if self.nodes[cur] == nextParent: 37 | break 38 | self.nodes[cur] = nextParent 39 | 40 | def dump(self): 41 | print ("numLeaves: %d, numBranches: %d, n: %d, nodes: " % (self.numLeaves, self.numBranches, self.n)) 42 | cur = 0 43 | for i in range(self.height + 1): 44 | for j in range(2 ** i): 45 | print (self.nodes[cur]) 46 | cur += 1 47 | print ('') 48 | 49 | def print_leaves(self): 50 | for i in range(self.input_length): 51 | print (self.nodes[self.numBranches + i]) 52 | -------------------------------------------------------------------------------- /UGFraud/Detector/SVD.py: -------------------------------------------------------------------------------- 1 | """ 2 | 'Singular Value Decomposition and Least Squares Solutions' 3 | The Singular-Value Decomposition, or SVD for short, is a matrix decomposition method for reducing 4 | a matrix to its constituent parts in order to make certain subsequent matrix calculations simpler. 
5 | Article: https://link.springer.com/content/pdf/10.1007/978-3-662-39778-7_10.pdf 6 | """ 7 | 8 | from UGFraud.Utils.helper import * 9 | from sklearn import svm 10 | from sklearn.svm import SVC 11 | from scipy.sparse.linalg import svds 12 | import numpy as np 13 | 14 | 15 | class SVD: 16 | def __init__(self, graph): 17 | """set up the data 18 | Args: 19 | graph: a networkx graph 20 | """ 21 | user_priors = node_attr_filter(graph, 'types', 'user', 'prior') 22 | prod_priors = node_attr_filter(graph, 'types', 'prod', 'prior') 23 | num_users = len(user_priors) 24 | num_products = len(prod_priors) 25 | self.user_prod_matrix = np.empty(shape=(num_users, num_products)) 26 | 27 | # create a dict for user_index in the user list and a dict for prod_index in the product list 28 | self.user_index = dict() 29 | self.prod_index = dict() 30 | 31 | i = 0 32 | for u_id in user_priors.keys(): 33 | self.user_index[u_id] = i 34 | i = i + 1 35 | 36 | j = 0 37 | for prod_id in prod_priors.keys(): 38 | self.prod_index[prod_id] = j 39 | j = j + 1 40 | # 41 | for user_id in user_priors.keys(): 42 | for p_id in graph[user_id].keys(): 43 | rating = graph.edges.get((user_id, p_id))['rating'] 44 | row = self.user_index[user_id] 45 | column = self.prod_index[p_id] 46 | self.user_prod_matrix[row, column] = rating 47 | 48 | @timer 49 | def run(self, percent): 50 | """ 51 | perform SVD and return the user-product matrix in a lower dimensional space 52 | """ 53 | k = int(max(np.round(min(self.user_prod_matrix.shape) * percent), 1)) 54 | u, s, v = svds(self.user_prod_matrix, k=k) 55 | return u 56 | 57 | def random_split(self, graph): 58 | """ 59 | Partition user nodes into training and test set randomly. 60 | Args: 61 | user_product_graph: a dictionary, with key = user_id, value = (p_id, rating, label, time) 62 | Return: 63 | training_user_id: a set of user id to appear in model training 64 | """ 65 | pos = set() 66 | node_degree = {} 67 | user_dict = node_attr_filter(graph, 'types', 'user', 'types') 68 | for u_id in user_dict.keys(): 69 | for p_id in graph[u_id].keys(): 70 | if graph.edges.get((u_id, p_id))['label'] == 0: 71 | pos.add(u_id) 72 | break 73 | node_degree[u_id] = len(graph[u_id]) 74 | 75 | neg = set(list(user_dict.keys())) - pos 76 | 77 | # random sample positive users 78 | training_pos = set(np.random.choice(list(pos), int(0.5 * len(pos))).ravel()) 79 | training_neg = set(np.random.choice(list(neg), int(0.5 * len(neg))).ravel()) 80 | 81 | test_pos = pos - training_pos 82 | test_neg = neg - training_neg 83 | 84 | print("number of positive %d" % len(pos)) 85 | print("number of negative %d" % len(neg)) 86 | print("number of all users %d" % len(user_dict)) 87 | 88 | return training_pos, training_neg, test_pos, test_neg 89 | 90 | def classify(self, training_data_svm, training_labels_svm, testing_data_svm, testing_labels_svm): 91 | clf = svm.SVC(probability=True) 92 | clf.fit(training_data_svm, training_labels_svm) 93 | SVC(C=100, tol=0.00001) 94 | predictions = clf.predict_proba(testing_data_svm) 95 | return predictions 96 | 97 | def classify_binary(self, training_data_svm, training_labels_svm, testing_data_svm, testing_labels_svm): 98 | clf = svm.SVC() 99 | clf.fit(training_data_svm, training_labels_svm) 100 | SVC(C=100, tol=0.00001) 101 | predictions = clf.predict(testing_data_svm) 102 | return predictions 103 | 104 | def evaluate_SVD(self, svd_output, graph): 105 | # random_split 106 | training_pos, training_neg, test_pos, test_neg = self.random_split(graph) 107 | training_labels = {i: +1 for i in 
training_pos} 108 | training_labels.update({i: -1 for i in training_neg}) 109 | 110 | test_labels = {i: +1 for i in test_pos} 111 | test_labels.update({i: -1 for i in test_neg}) 112 | 113 | training_data_svm = np.empty(shape=(len(training_labels), len(svd_output[1, :]))) 114 | training_labels_svm = np.empty(shape=(len(training_labels))) 115 | # build training data and labels for svm 116 | i = 0 117 | find_training_uid = dict() 118 | for k, v in training_labels.items(): 119 | u_index = self.user_index[k] 120 | training_data_svm[i, :] = svd_output[u_index, :] 121 | training_labels_svm[i] = v 122 | find_training_uid[i] = k 123 | i = i + 1 124 | # build testing data and labels for svm 125 | testing_data_svm = np.empty(shape=(len(test_labels), len(svd_output[1, :]))) 126 | testing_labels_svm = np.empty(shape=(len(test_labels))) 127 | j = 0 128 | find_testing_uid = np.empty(shape=(len(test_labels))) 129 | for k, v in test_labels.items(): 130 | u_index = self.user_index[k] 131 | testing_data_svm[j, :] = svd_output[u_index, :] 132 | testing_labels_svm[j] = v 133 | find_testing_uid[j] = k 134 | j = j + 1 135 | 136 | probas_pred = self.classify(training_data_svm, training_labels_svm, testing_data_svm, testing_labels_svm) 137 | result = [find_testing_uid, probas_pred[:, 0]] 138 | return result -------------------------------------------------------------------------------- /UGFraud/Detector/SpEagle.py: -------------------------------------------------------------------------------- 1 | """ 2 | 'Collective Opinion Spam Detection: Bridging Review Networks and Metadata' 3 | Utilizing clues from all metadata (text, timestamp, rating) as well as relational data (network), 4 | and harness them collectively under a unified framework to spot suspicious users and reviews, 5 | as well as products targeted by spam. 6 | Article: https://www.andrew.cmu.edu/user/lakoglu/pubs/15-kdd-collectiveopinionspam.pdf 7 | """ 8 | 9 | from UGFraud.Utils.helper import * 10 | from heapq import * 11 | from scipy.special import logsumexp 12 | import pickle 13 | 14 | 15 | class myTuple(): 16 | def __init__(self, cost, node_id): 17 | self._cost = cost 18 | self._id = node_id 19 | 20 | def __lt__(self, other): 21 | return self._cost < other._cost 22 | 23 | 24 | class Node(object): 25 | """ a Node object represents a node on the graph (which is also a random variable). 26 | 27 | Attributes: 28 | _name: node's ID. a string 29 | _type: a string denoting the type of the node (User, Review, Product) 30 | _prior: the node's prior distribution (\phi) 31 | _num_classes: number of classes, which is also the length of the prior vector. 32 | _outgoing: a dictionary of out-going messages to its neighbors (key: j, value: m_{i\to j}) 33 | where i is the current node and j is the target node. 34 | _neighbors: a list of references to its neighbors 35 | """ 36 | 37 | def __init__(self, name, prior, node_type): 38 | """ Create the attributes 39 | Args: 40 | name: a string id of this node. 
41 | prior: a floating number between [0,1] representing P(y=1 | node) 42 | node_type: 'u', 'p' or 'r' 43 | """ 44 | # to prevent log 0 45 | self._eps = 1e-5 46 | 47 | # node id (such as a u_id, p_id or review_id) 48 | self._name = name 49 | 50 | # list of names (such as u_id, p_id, and review_id) of the neighboring nodes 51 | self._neighbors = [] 52 | 53 | # a dictionary with key = neighboring node id, value = np.array() representing the message 54 | # from this node to the neighbor 55 | self._outgoing = {} 56 | 57 | # prior in log space, with check on 0's 58 | if prior == 1: 59 | prior = 1 - self._eps 60 | elif prior == 0: 61 | prior = self._eps 62 | 63 | self._prior = np.log( 64 | np.array([1 - prior, prior])) # previous version: self._prior = np.log(np.array([1-prior, prior])) 65 | 66 | self._num_classes = 2 67 | 68 | self._type = node_type 69 | 70 | # if self._type == 'p' and self._name == 'p0': 71 | # print ('product %s initialized.' % self._name) 72 | # print (self._outgoing) 73 | 74 | def add_neighbor(self, neighbor_node_id): 75 | """ 76 | add a neighboring node to this node; create out-going message from this node to the neighbor 77 | Args: 78 | neighbor_node_id: a string representing the neighbor's id 79 | """ 80 | self._neighbors.append(neighbor_node_id) 81 | self._outgoing[neighbor_node_id] = np.zeros(self._num_classes) 82 | 83 | def add_local_neighbor(self, neighbor_node_id, message): 84 | """ 85 | add a neighboring node to this node to build local graph; copy out-going message from the global graph 86 | Args: 87 | neighbor_node_id: a string representing the neighbor's id 88 | message: the message from this node to its neighbor 89 | """ 90 | # find the message to the corresponding neighbor_node_id 91 | 92 | for m in message: 93 | if neighbor_node_id in m.keys(): 94 | message_to_neighbor = m[neighbor_node_id] 95 | break 96 | 97 | self._neighbors.append(neighbor_node_id) 98 | self._outgoing[neighbor_node_id] = message_to_neighbor 99 | 100 | def init_outgoing(self): 101 | """ 102 | Initialize all messages to 0. 
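(Messages are stored in log space, so a zero vector corresponds to the uniform, multiplicative-identity message.)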
103 | """ 104 | # a dictionary: key = neighbor id, value = np.ndarray of uniform distributions 105 | #self._outgoing = {n: np.zeros(self._num_classes) for n in self._neighbors} 106 | for n in self._neighbors: 107 | self._outgoing[n].fill(0.0) 108 | 109 | # if self._type == 'p' and self._name == 'p0': 110 | # print (self._outgoing) 111 | 112 | def n_edges(self): 113 | return len(self._neighbors) 114 | 115 | def get_name(self): 116 | return self._name 117 | 118 | def get_type(self): 119 | return self._type 120 | 121 | def get_prior(self): 122 | """" return the prior of the node in prob space """ 123 | return np.exp(self._prior) 124 | 125 | def get_neighbors(self): 126 | return self._neighbors 127 | 128 | def get_outgoing(self): 129 | return self._outgoing 130 | 131 | def get_message_for(self, neighbor_name): 132 | """ find the message sent from this node to the neighbor specified by neighbor_name """ 133 | 134 | # note that _outgoing is a dictionary with key = neighbor id and value = messages 135 | # print(neighbor_name) 136 | assert neighbor_name in self._outgoing, "the neighbor %s is not a neighbor of the node %s\n" % ( 137 | neighbor_name, self._name) 138 | 139 | return self._outgoing[neighbor_name] 140 | 141 | def get_belief(self, all_nodes): 142 | """ return the belief of the node, along with the messages used to compute the belief 143 | Args: 144 | all_nodes: a dictionary containing all nodes on the graph 145 | Return: 146 | belief: 147 | incoming: 148 | """ 149 | 150 | incoming = [] 151 | 152 | # log 1 = 0 153 | belief = np.zeros(self._num_classes) 154 | 155 | # add log of phi 156 | belief += self._prior 157 | 158 | # go through each neighbor of the node 159 | for node_id in self._neighbors: 160 | # get the message sent from the neighbor n to the current node (self._name) 161 | 162 | # look up the neighboring node in all_nodes 163 | n = all_nodes[node_id] 164 | 165 | # getting message from the neighboring node to this node 166 | # consider working in the log scale to prevent underflowing 167 | 168 | # sum log m_ij 169 | belief += n.get_message_for(self._name) 170 | 171 | # in the same order as self._neighbors 172 | incoming.append(n.get_message_for(self._name)) 173 | # print (n.get_message_for(self._name)) 174 | 175 | return belief, incoming 176 | 177 | def recompute_outgoing(self, potentials, all_nodes, normalize=True): 178 | """ for each neighbor j, update the message sent to j 179 | 180 | Args: 181 | potentials: a dictionary (key = edge type, value = log of potential matrix). 182 | An edge type is src_type + "_" + dst_type 183 | all_nodes: same as that in get_belief 184 | 185 | Return: 186 | difference between previous and updated messages. 
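In log space, the update computed below for each neighbor j is
    log m_{i->j}(y_j) = logsumexp over y_i of [ log H(y_i, y_j) + log phi_i(y_i) + sum over k in N(i), k != j, of log m_{k->i}(y_i) ] - log Z,
where H is the edge potential for this edge type, phi_i is the node's prior, and log Z normalizes the message into a distribution over y_j.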
187 | """ 188 | # return value 189 | diff = 0 190 | 191 | # the messages in incoming is in the same order of self._neighbors 192 | # total = log phi_i + sum_{j~i} log m_ji 193 | # incoming = [log m_ji] 194 | total, incoming = self.get_belief(all_nodes) 195 | 196 | # go through each neighbor of the node 197 | for j, n_id in enumerate(self._neighbors): 198 | 199 | n = all_nodes[n_id] 200 | 201 | log_m_i = total - incoming[j] 202 | 203 | # note that the potential matrix depends on the edge type (write(user, review) or belong(review, product)) 204 | # edge_type can be (user-review), (review-product), (review-user) and (product-review) 205 | edge_type = self._type + '_' + n._type 206 | 207 | # log H, where H is symmetric and there is no need to transpose it 208 | log_H = potentials[edge_type] 209 | 210 | log_m_ij = logsumexp(log_H + np.tile(log_m_i.transpose(), (2, 1)), axis=1) 211 | 212 | # normalize the message 213 | log_Z = logsumexp(log_H + np.tile(log_m_i.transpose(), (2, 1))) 214 | 215 | log_m_ij -= log_Z# 216 | 217 | # accumulate the difference 218 | diff += np.sum(np.abs(self._outgoing[n._name] - log_m_ij)) 219 | 220 | # set the message from i to j 221 | self._outgoing[n._name] = log_m_ij 222 | return diff 223 | 224 | 225 | class SpEagle: 226 | def __init__(self, graph, potentials, message=None, max_iters=1): 227 | """ set up the data and parameters. 228 | 229 | Args: 230 | graph: a networkx graph 231 | 232 | potentials: a dictionary (key = edge_type, value=np.ndarray) 233 | """ 234 | 235 | self._potentials = potentials 236 | self._max_iters = max_iters 237 | self._message = message 238 | 239 | self._user_priors = node_attr_filter(graph, 'types', 'user', 'prior') 240 | self._product_priors = node_attr_filter(graph, 'types', 'prod', 'prior') 241 | self._review_priors = edge_attr_filter(graph, 'types', 'review', 'prior') 242 | 243 | # create nodes on the graph. 
key = u_id / p_id / review_id, value = node 244 | self._nodes = {} 245 | 246 | self._bp_schedule = [] 247 | 248 | # add nodes and edges to build the graph 249 | for u_id in self._user_priors.keys(): 250 | unique_u_id = 'u' + u_id 251 | 252 | # prior in log scale 253 | self._nodes[unique_u_id] = Node(unique_u_id, self._user_priors[u_id], 'u') 254 | 255 | # go through the reviews posted by the user 256 | for p_id in graph[u_id].keys(): 257 | unique_p_id = 'p' + p_id 258 | 259 | if unique_p_id not in self._nodes: 260 | self._nodes[unique_p_id] = Node(unique_p_id, self._product_priors[p_id], 'p') 261 | 262 | review_id = (u_id, p_id) 263 | unique_review_id = (unique_u_id, unique_p_id) 264 | 265 | if unique_review_id not in self._nodes: 266 | review_node = Node(unique_review_id, self._review_priors[review_id], 'r') 267 | 268 | # add connections and out-going messages if the graph is a global graph 269 | if self._message is None: 270 | review_node.add_neighbor(unique_u_id) 271 | review_node.add_neighbor(unique_p_id) 272 | self._nodes[unique_u_id].add_neighbor(unique_review_id) 273 | self._nodes[unique_p_id].add_neighbor(unique_review_id) 274 | else: 275 | # add connections and out-going messages if the graph is a local graph 276 | review_node.add_local_neighbor(unique_u_id, message[unique_review_id]) 277 | review_node.add_local_neighbor(unique_p_id, message[unique_review_id]) 278 | self._nodes[unique_u_id].add_local_neighbor(unique_review_id, message[unique_u_id]) 279 | self._nodes[unique_p_id].add_local_neighbor(unique_review_id, message[unique_p_id]) 280 | 281 | self._nodes[unique_review_id] = review_node 282 | 283 | def add_new_data(self, new_user_product_graph, new_priors): 284 | """ 285 | Add new a new users-review-products sub-graph to the global existing graph. 
286 | Need to be very careful as we don't want to mess up with existing structures and information 287 | 288 | :param new_user_product_graph: same format as the user_product_graph argument in __init__ 289 | :param new_priors: same format as the priors argument in __init__ 290 | :return: None 291 | """ 292 | new_u_priors = new_priors[0] 293 | new_p_priors = new_priors[2] 294 | new_r_priors = new_priors[1] 295 | 296 | for u_id, reviews in new_user_product_graph.items(): 297 | unique_u_id = 'u' + u_id 298 | if unique_u_id not in self._nodes: 299 | self._user_priors[u_id] = new_u_priors[u_id] 300 | self._nodes[unique_u_id] = Node(unique_u_id, self._user_priors[u_id], 'u') 301 | 302 | # go through the reviews posted by the user 303 | for t in reviews: 304 | p_id = t[0] 305 | unique_p_id = 'p' + p_id 306 | 307 | if unique_p_id not in self._nodes: 308 | self._product_priors[p_id] = new_p_priors[p_id] 309 | self._nodes[unique_p_id] = Node(unique_p_id, self._product_priors[p_id], 'p') 310 | 311 | review_id = (u_id, p_id) 312 | unique_review_id = (unique_u_id, unique_p_id) 313 | 314 | if unique_review_id not in self._nodes: 315 | self._review_priors[review_id] = new_r_priors[review_id] 316 | review_node = Node(unique_review_id, self._review_priors[review_id], 'r') 317 | 318 | # add connections and out-going messages 319 | review_node.add_neighbor(unique_u_id) 320 | review_node.add_neighbor(unique_p_id) 321 | self._nodes[unique_u_id].add_neighbor(unique_review_id) 322 | self._nodes[unique_p_id].add_neighbor(unique_review_id) 323 | self._nodes[unique_review_id] = review_node 324 | 325 | def safe_log(self, array, eps=1e-5): 326 | """ element-wise log the given array with smoothing worrying zeros 327 | """ 328 | return np.log((array + eps) / np.sum(array + eps)) 329 | 330 | def output_graph(self): 331 | """ 332 | output nodes, edges, priors and potentials 333 | """ 334 | for n in self._nodes.values(): 335 | print(str(n.get_name()) + ": " + n.get_type()) 336 | print(n.get_prior()) 337 | print(n.get_neighbors()) 338 | 339 | def schedule(self, schedule_type='bfs'): 340 | """ use breadth-first-search to create a BP schedule 341 | :param: 342 | schedule_type: 'bfs' or 'degree' 343 | :return: 344 | """ 345 | 346 | # sort nodes in descending order of their degrees 347 | items = [(n.get_name(), n.n_edges()) for k, n in self._nodes.items()] 348 | items = sorted(items, key=lambda x: x[1], reverse=True) 349 | 350 | if schedule_type == 'degree': 351 | self._bp_schedule = [name for name, _ in items] 352 | return 353 | 354 | mark = set(self._nodes.keys()) 355 | self._bp_schedule = [] 356 | 357 | head = 0 358 | tail = -1 359 | 360 | # uncomment this for loop to get bfs + degree 361 | for node_id, _ in items: 362 | # uncomment this for loop to get regular bfs 363 | # for node_id, _ in self._nodes.items(): 364 | node = self._nodes[node_id] 365 | # newly-found connected component 366 | if node_id in mark: 367 | tail += 1 368 | self._bp_schedule.append(node_id) 369 | mark.remove(node_id) 370 | 371 | # search starting from i 372 | while head <= tail: 373 | cur_node = self._nodes[self._bp_schedule[head]] 374 | head += 1 375 | for neighbor_id in cur_node._neighbors: 376 | if neighbor_id in mark: 377 | tail += 1 378 | self._bp_schedule.append(neighbor_id) 379 | mark.remove(neighbor_id) 380 | 381 | def local_schedule(self, starting_nodes, num_hops): 382 | """ 383 | Use Dijkstra to find nodes that are num_hops away from the starting nodes 384 | :param starting_nodes: the nodes considered the "source" 385 | :param num_hops: how 
far away to go 386 | :return: 387 | """ 388 | # node searched so far 389 | seen = set() 390 | # minimum distance of each seen node to the source nodes 391 | min_costs = {} 392 | # a priority queue of (cost, node_id) 393 | q = [] 394 | 395 | # initialize the queue to contain the starting nodes 396 | for node_id in starting_nodes: 397 | q.append(myTuple(0, node_id)) 398 | min_costs[node_id] = 0 399 | 400 | heapify(q) 401 | self._bp_schedule = [] 402 | while q: 403 | tuple = heappop(q) 404 | v = tuple._id 405 | cost = tuple._cost 406 | 407 | # if the node has cost no greater than num_hops, include it in the update schedule 408 | if cost <= num_hops: 409 | self._bp_schedule.append(v) 410 | 411 | if v not in seen: 412 | # now the node v has its shortest distance to the starting nodes. 413 | seen.add(v) 414 | cur_node = self._nodes[v] 415 | for n in cur_node._neighbors: 416 | if n not in seen: 417 | prev = min_costs.get(n, None) 418 | next = cost + 1 419 | if prev is None or next < prev: 420 | min_costs[n] = next 421 | heappush(q, myTuple(next, n)) 422 | return None 423 | 424 | @timer 425 | def run_bp(self, start_iter=0, max_iters=-1, early_stop_at=1, tol=1e-3): 426 | """ run belief propagation on the graph for MaxIters iterations 427 | Args: 428 | start_iter: continuing from the results of previous iterations 429 | max_iters: how many iterations to run BP. Default use the SpEagle's parameter 430 | early_stop_at: the percentage of nodes whose out-going messages will be updated 431 | tol: threshold of message differences of one iteration, below which exit BP 432 | Return: 433 | delta: the difference in messages before and after iterations of message passing 434 | """ 435 | stop_at = int(len(self._bp_schedule) * early_stop_at) 436 | 437 | if max_iters == -1: 438 | max_iters = self._max_iters 439 | 440 | for it in range(start_iter, start_iter + max_iters, 1): 441 | if it % 2 == 0: 442 | start = stop_at - 1 443 | end = -1 444 | step = -1 445 | else: 446 | start = 0 447 | end = stop_at 448 | step = 1 449 | p = start 450 | total_updates = 0 451 | delta = 0 452 | while p != end: 453 | total_updates += 1 454 | cur_node = self._nodes[self._bp_schedule[p]] 455 | p += step 456 | delta += cur_node.recompute_outgoing(self._potentials, self._nodes) 457 | if total_updates > stop_at: 458 | break 459 | delta /= total_updates 460 | # print('bp_iter = %d, delta = %f\n' % (it, delta)) 461 | if abs(delta) < tol: 462 | break 463 | return delta 464 | 465 | @timer 466 | def classify(self): 467 | """ read out the id of the maximal entry of each belief vector 468 | Return: 469 | userBelief: beliefs of the users 470 | reviewBelief: beliefs of the reviews 471 | prodBelief: beliefs of the products 472 | """ 473 | userBelief= {} 474 | reviewBelief= {} 475 | prodBelief= {} 476 | 477 | for k, n in self._nodes.items(): 478 | # decide the type of the node and find its original name 479 | node_type = None 480 | if isinstance(k, tuple): 481 | node_type = 'review' 482 | u_id = k[0][1:] 483 | p_id = k[1][1:] 484 | review_id = (u_id, p_id) 485 | else: 486 | if k[0] == 'u': 487 | node_type = 'user' 488 | user_id = k[1:] 489 | else: 490 | node_type = 'product' 491 | prod_id = k[1:] 492 | 493 | belief, _ = n.get_belief(self._nodes) 494 | 495 | # from log scale to prob scale and normalize to prob distribution 496 | posterior_med = np.exp(belief) 497 | posterior = posterior_med / np.sum(posterior_med) 498 | 499 | if node_type == 'review': 500 | reviewBelief[review_id] = posterior[1] 501 | elif node_type == 'user': 502 | 
userBelief[user_id] = posterior[1] 503 | elif node_type == 'product': 504 | prodBelief[prod_id] = posterior[1] 505 | else: 506 | continue 507 | 508 | return userBelief, reviewBelief, prodBelief 509 | 510 | 511 | if __name__ == '__main__': 512 | prefix = '/Users/dozee/Desktop/Reseach/Spam_Detection/Dataset/YelpChi/' 513 | metadata_filename = prefix + 'metadata.gz' 514 | 515 | # prior file names 516 | user_prior_filename = prefix + 'UserPriors.pickle' 517 | prod_prior_filename = prefix + 'ProdPriors.pickle' 518 | review_prior_filename = prefix + 'ReviewPriors.pickle' 519 | 520 | # read the graph and node priors 521 | user_product_graph, product_user_graph = read_graph_data(metadata_filename) 522 | 523 | with open(user_prior_filename, 'rb') as f: 524 | user_priors = pickle.load(f) 525 | 526 | with open(prod_prior_filename, 'rb') as f: 527 | prod_priors = pickle.load(f) 528 | 529 | with open(review_prior_filename, 'rb') as f: 530 | review_priors = pickle.load(f) 531 | 532 | # print(user_priors) 533 | # set up edge potentials 534 | ''' 535 | User and Review potential 536 | [1,0] 537 | [0,1] 538 | Reviewer and Review potential 539 | [1 - eps, eps] 540 | [eps, 1 - eps] 541 | ''' 542 | numerical_eps = 1e-5 543 | user_review_potential = np.log(np.array([[1 - numerical_eps, numerical_eps], [numerical_eps, 1 - numerical_eps]])) 544 | eps = 0.1 545 | review_product_potential = np.log(np.array([[1 - eps, eps], [eps, 1 - eps]])) 546 | 547 | potentials = {'u_r': user_review_potential, 'r_u': user_review_potential, 548 | 'r_p': review_product_potential, 'p_r': review_product_potential} 549 | 550 | model = SpEagle(user_product_graph, [user_priors, prod_priors, review_priors], potentials, max_iters=100) 551 | model.schedule() 552 | model.run_bp() 553 | 554 | -------------------------------------------------------------------------------- /UGFraud/Detector/ZooBP.py: -------------------------------------------------------------------------------- 1 | """ 2 | ZooBP: Belief Propagation for Heterogeneous Networks. 3 | A method to perform fast BP on undirected heterogeneous graphs with provable convergence guarantees. 4 | Article: http://www.vldb.org/pvldb/vol10/p625-eswaran.pdf 5 | """ 6 | 7 | from UGFraud.Utils.helper import timer 8 | from scipy.special import logsumexp 9 | from scipy import sparse 10 | from collections import defaultdict 11 | import numpy as np 12 | import networkx as nx 13 | 14 | 15 | def Initialize_Final_Beliefs(N1, N2, m): 16 | """ 17 | Initialization of final beliefs 18 | Args: 19 | N1: number of users 20 | N2: number of products 21 | m: coefficient for reduction in beliefs 22 | Returns: 23 | Concatenation of initialized final beliefs for users and products 24 | Example of return values: -0.5 0.5 -0.3 0.3 ... 
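For instance, with N1 = 2 users, N2 = 1 product and m = 0.001, each node receives one random value r in [-0.0005, 0.0005), the pair (r, -r) is stacked per node, and B is returned as a column vector of shape (2 * (N1 + N2), 1) = (6, 1), user entries first and product entries last.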
25 | """ 26 | r1 = m * (np.random.uniform(size=N1) - 0.5) 27 | r1 = r1.reshape(r1.shape[0], 1) 28 | r2 = m * (np.random.uniform(size=N2) - 0.5) 29 | r2 = r2.reshape(r2.shape[0], 1) 30 | B1 = np.concatenate((r1, -r1), axis=1) 31 | B2 = np.concatenate((r2, -r2), axis=1) 32 | 33 | temp1_B = B1.reshape((B1.shape[1] * B1.shape[0], 1)) 34 | temp2_B = B2.reshape((B2.shape[1] * B2.shape[0], 1)) 35 | B = np.concatenate((temp1_B, temp2_B), axis=0) 36 | 37 | return B 38 | 39 | 40 | class ZooBP: 41 | def __init__(self, graph, ep, H): 42 | """ 43 | Implementation of ZooBP in Python 44 | Args: 45 | graph: a networkx graph 46 | ep: interaction strength 47 | H: compatibility matrix 48 | Returns: 49 | final_user_beliefs: centered version of final user beliefs 50 | final_prod_beliefs: centered version of final prod beliefs 51 | NOTE: 52 | ZooBP requires consecutive integer ids (no gaps) 53 | """ 54 | a_list_temp = nx.get_edge_attributes(graph, 'rating') 55 | n, p = list(zip(*list(a_list_temp.keys()))) 56 | reversed_dict = defaultdict(list) 57 | node_types_index = nx.get_node_attributes(graph, 'types') 58 | for key, value in node_types_index.items(): 59 | reversed_dict[value].append(key) 60 | self.a_list = np.array(list(zip(n, p, a_list_temp.values())), dtype=np.int32) 61 | u_priors = dict() 62 | p_priors = dict() 63 | node_prior_index = nx.get_node_attributes(graph, 'prior') 64 | for i in reversed_dict['user']: 65 | u_priors[i] = node_prior_index[i] 66 | for i in reversed_dict['prod']: 67 | p_priors[i] = node_prior_index[i] 68 | self.u_tag, user_priors = zip(*u_priors.items()) 69 | self.u_priors = np.array(user_priors) 70 | self.p_tag, prod_priors = zip(*p_priors.items()) 71 | self.p_priors = np.array(prod_priors) 72 | self.ep = ep 73 | self.H = H 74 | 75 | @timer 76 | def run(self): 77 | # converts the given priors to the centered version 78 | user_priors = self.u_priors - 0.5 * np.ones((self.u_priors.shape[0])) 79 | prod_priors = self.p_priors - 0.5 * np.ones((self.p_priors.shape[0])) 80 | # finds positive (1) and negative (2) edges and reshapes them; the assignments below are restricted to the rating column so the user/product id columns are not overwritten 81 | rating = self.a_list[:, 2] 82 | self.a_list[self.a_list[:, 2] == 2, 2] = 2 83 | self.a_list[self.a_list[:, 2] == 1, 2] = 1 84 | edges_pos = self.a_list[rating == 1] 85 | edges_neg = self.a_list[rating == 2] 86 | Lpos = edges_pos[:, 0:2] 87 | Lpos = Lpos.reshape((edges_pos.shape[0], 2)) 88 | Lneg = edges_neg[:, 0:2] 89 | Lneg = Lneg.reshape((edges_neg.shape[0], 2)) 90 | n_user = user_priors.shape[0] 91 | n_prod = prod_priors.shape[0] 92 | 93 | # computes A+ and A- as defined in section 4.7 of ZooBP 94 | lpos_0 = Lpos[:, 0] - np.ones(Lpos[:, 0].shape[0]) 95 | lpos_1 = Lpos[:, 1] - np.ones(Lpos[:, 1].shape[0]) 96 | Apos = sparse.coo_matrix((np.ones(Lpos.shape[0]), (lpos_0, lpos_1)), shape=(n_user, n_prod)) 97 | lneg_0 = Lneg[:, 0] - np.ones(Lneg[:, 0].shape[0]) 98 | lneg_1 = Lneg[:, 1] - np.ones(Lneg[:, 1].shape[0]) 99 | Aneg = sparse.coo_matrix((np.ones(len(Lneg)), (lneg_0, lneg_1)), shape=(n_user, n_prod)) 100 | 101 | # prior beliefs are reshaped so that user1_belief 1-user1_belief ... 
prod1_belief 1-prod1_belief 102 | h_user_priors = np.reshape(user_priors, (len(user_priors), -1)) 103 | h_prod_priors = np.reshape(prod_priors, (len(prod_priors), -1)) 104 | user_priors = np.hstack((h_user_priors, -h_user_priors)) 105 | prod_priors = np.hstack((h_prod_priors, -h_prod_priors)) 106 | reshape_u = user_priors.reshape((2 * n_user, 1)) 107 | reshape_p = prod_priors.reshape((2 * n_prod, 1)) 108 | E = np.concatenate((reshape_u, reshape_p)) 109 | 110 | # build P defined under section 4.7 of ZooBP 111 | R = sparse.kron(Apos - Aneg, self.ep * self.H) 112 | sp1 = sparse.coo_matrix((2 * n_user, 2 * n_user), dtype=np.int8) 113 | temp1 = sparse.hstack([sp1, 0.5 * R]) 114 | sp2 = sparse.coo_matrix((2 * n_prod, 2 * n_prod), dtype=np.int8) 115 | temp2 = sparse.hstack([0.5 * R.transpose(), sp2]) 116 | P = sparse.vstack((temp1, temp2)) 117 | P = P.transpose() 118 | 119 | # build Q defined under section 4.7 of ZooBP 120 | sum_temp = Apos + Aneg 121 | temp1 = sum_temp.sum(axis=1) 122 | temp2 = sum_temp.sum(axis=0) 123 | D12 = sparse.diags(np.asarray(temp1.flatten()).reshape(-1)) 124 | D21 = sparse.diags(np.asarray(temp2.flatten()).reshape(-1)) 125 | temp = 0.25 * self.ep * self.ep * sparse.kron(D12, self.H) 126 | Q_1 = sparse.eye(n_user * 2) + temp 127 | Q_2 = sparse.eye(n_prod * 2) + (0.25 * self.ep * self.ep) * (sparse.kron(D21, self.H)) 128 | sp1 = sparse.coo_matrix((n_user * 2, n_prod * 2), dtype=np.int8) 129 | Q_temp1 = sparse.hstack((Q_1, sp1)) 130 | sp2 = sparse.coo_matrix((n_prod * 2, n_user * 2), dtype=np.int8) 131 | Q_temp2 = sparse.hstack((sp2, Q_2)) 132 | Q = sparse.vstack((Q_temp1, Q_temp2)) 133 | 134 | # M 135 | M = P - Q + sparse.eye(2 * (n_user + n_prod)) 136 | M = M.transpose() 137 | B = Initialize_Final_Beliefs(n_user, n_prod, 0.001) 138 | 139 | # Iterative Solution 140 | res = 1 141 | while (res > 1e-8): 142 | Bold = B 143 | # Equations (13) and (14) in ZooBP 144 | B = E + logsumexp(M * Bold) 145 | res = np.sum(np.sum(abs(Bold - B))) 146 | 147 | B1 = B[0:2 * n_user, :] 148 | B2 = B[2 * n_user:, :] 149 | user_beliefs = B1.reshape((n_user, 2)) 150 | user_beliefs = dict(zip(self.u_tag, user_beliefs[:, 0])) 151 | prod_beliefs = B2.reshape((n_prod, 2)) 152 | prod_beliefs = dict(zip(self.p_tag, prod_beliefs[:, 0])) 153 | 154 | return user_beliefs, prod_beliefs 155 | 156 | 157 | 158 | 159 | 160 | -------------------------------------------------------------------------------- /UGFraud/Detector/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- -------------------------------------------------------------------------------- /UGFraud/Detector/fBox.py: -------------------------------------------------------------------------------- 1 | """ 2 | 'Spotting Suspicious Link Behavior with fBox: An Adversarial Perspective.' 3 | An algorithm designed to catch small-scale, stealth attacks that slip below the radar. 
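In brief, fBox computes a rank-k truncated SVD of the binary user-product adjacency matrix and flags users and products whose reconstructed degree falls below the tau-th percentile among nodes with the same observed degree.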
4 | Article: https://arxiv.org/pdf/1410.3915.pdf 5 | """ 6 | 7 | from UGFraud.Utils.helper import timer 8 | from numpy.linalg import * 9 | from scipy.sparse import csr_matrix 10 | from scipy.sparse.linalg import svds 11 | import numpy as np 12 | 13 | 14 | class fBox(): 15 | def __init__(self, graph): 16 | """ 17 | fBox only takes a binary user-product graph 18 | graph: a networkx graph 19 | """ 20 | self.u_id2idx = {} 21 | self.idx2u_id = {} 22 | self.p_id2idx = {} 23 | self.idx2p_id = {} 24 | 25 | # construct a sparse matrix from the graph 26 | row_idx = [] 27 | col_idx = [] 28 | data = [] 29 | 30 | user_idx = 0 31 | product_idx = 0 32 | for k in graph.edges(): 33 | if k[0] not in self.u_id2idx: 34 | self.u_id2idx[k[0]] = user_idx 35 | self.idx2u_id[user_idx] = k[0] 36 | user_idx += 1 37 | 38 | if k[1] not in self.p_id2idx: 39 | self.p_id2idx[k[1]] = product_idx 40 | self.idx2p_id[product_idx] = k[1] 41 | product_idx += 1 42 | 43 | row_idx.append(self.u_id2idx[k[0]]) 44 | col_idx.append(self.p_id2idx[k[1]]) 45 | data.append(1) 46 | 47 | self.num_users = user_idx 48 | self.num_products = product_idx 49 | self.matrix = csr_matrix((data, (row_idx, col_idx)), shape=(user_idx, product_idx)).asfptype() 50 | 51 | @timer 52 | def run(self, tau, k): 53 | """ 54 | run the algorithm. 55 | tau: the percentile in reconstructed degree threshold under which a node is considered suspicious 56 | """ 57 | # k = 50 is selected based on Figure 3 of the paper 58 | u, s, vt = svds(self.matrix, k=k) 59 | # reconstructed out degree 60 | self.recOutDeg = norm(u.dot(np.diag(s)), axis=1) 61 | # reconstructed in degree 62 | self.recInDeg = norm(vt.T.dot(np.diag(s)), axis=1) 63 | 64 | # detect users 65 | out_deg = self.matrix.sum(axis=1) 66 | self.out_deg = np.array(out_deg).reshape(-1, ) 67 | self.unique_out_deg = np.unique(self.out_deg) 68 | 69 | # store the indices of suspicious users 70 | suspicious_users = {} 71 | thresholds = {} 72 | for d in self.unique_out_deg: 73 | # find users with original degree = d 74 | users = (self.out_deg == d) 75 | user_deg = self.recOutDeg[users] 76 | thresholds[d] = np.percentile(user_deg, tau) 77 | 78 | for i in range(self.num_users): 79 | user_d = self.out_deg[i] 80 | if self.recOutDeg[i] < thresholds[user_d]: 81 | 82 | if user_d not in suspicious_users: 83 | suspicious_users[user_d] = [] 84 | suspicious_users[user_d].append(self.idx2u_id[i]) 85 | 86 | # detect products 87 | in_deg = self.matrix.sum(axis=0) 88 | self.in_deg = np.array(in_deg).reshape(-1, ) 89 | self.unique_in_deg = np.unique(self.in_deg) 90 | 91 | # store the indices of suspicious users 92 | suspicious_products = {} 93 | thresholds = {} 94 | 95 | for d in self.unique_in_deg: 96 | prods = (self.in_deg == d) 97 | prod_deg = self.recInDeg[prods] 98 | thresholds[d] = np.percentile(prod_deg, tau) 99 | 100 | for i in range(self.num_products): 101 | prod_d = self.in_deg[i] 102 | if self.recInDeg[i] < thresholds[prod_d]: 103 | if prod_d not in suspicious_products: 104 | suspicious_products[prod_d] = [] 105 | suspicious_products[prod_d].append(self.idx2p_id[i]) 106 | 107 | return suspicious_users, suspicious_products 108 | 109 | def get_srms(self): 110 | """ 111 | return two matrices one for use the other for products 112 | each matrix has rows as reconstruction degree and column as old degree in the graph. 
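(Concretely: osrm[i, d] counts how many users with observed out-degree d fall into the i-th histogram bin of reconstructed out-degree; isrm is the analogous matrix for products and in-degree.)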
113 | """ 114 | 115 | hist, edges = np.histogram(self.recOutDeg, bins=100) 116 | data = [] 117 | rows = [] 118 | cols = [] 119 | 120 | for d in self.unique_out_deg: 121 | user_deg = self.recOutDeg[self.out_deg == d] 122 | bin_indices = np.digitize(user_deg, edges) 123 | for i in bin_indices: 124 | data.append(1) 125 | rows.append(i) 126 | cols.append(d) 127 | 128 | self.osrm = csr_matrix((data, (rows, cols)), shape=(len(edges) + 1, max(self.unique_out_deg) + 1)) 129 | 130 | hist, edges = np.histogram(self.recInDeg, bins=10) 131 | data = [] 132 | rows = [] 133 | cols = [] 134 | for d in self.unique_in_deg: 135 | prod_deg = self.recInDeg[self.in_deg == d] 136 | bin_indices = np.digitize(prod_deg, edges) 137 | for i in bin_indices: 138 | data.append(1) 139 | rows.append(i) 140 | cols.append(d) 141 | self.isrm = csr_matrix((data, (rows, cols))) 142 | 143 | return self.osrm, self.isrm 144 | -------------------------------------------------------------------------------- /UGFraud/Utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- -------------------------------------------------------------------------------- /UGFraud/Utils/helper.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import average_precision_score 2 | from sklearn.metrics import roc_auc_score 3 | import gzip 4 | import numpy as np 5 | import networkx as nx 6 | import time 7 | import functools 8 | import warnings 9 | 10 | 11 | def create_ground_truth(user_data): 12 | """Given user data, return a dictionary of labels of users and reviews 13 | Args: 14 | user_data: key = user_id, value = list of review tuples. 15 | Return: 16 | user_ground_truth: key = user id (not prefixed), value = 0 (non-spam) /1 (spam) 17 | review_ground_truth: review id (not prefixed), value = 0 (non-spam) /1 (spam) 18 | """ 19 | user_ground_truth = {} 20 | review_ground_truth = {} 21 | 22 | for user_id, reviews in user_data.items(): 23 | 24 | user_ground_truth[user_id] = 0 25 | 26 | for r in reviews: 27 | prod_id = r[0] 28 | label = r[2] 29 | 30 | if label == -1: 31 | review_ground_truth[(user_id, prod_id)] = 1 32 | user_ground_truth[user_id] = 1 33 | else: 34 | review_ground_truth[(user_id, prod_id)] = 0 35 | 36 | return user_ground_truth, review_ground_truth 37 | 38 | 39 | def evaluate(y, pred_y): 40 | """ 41 | Revise: test when a key is a review/account. 42 | Evaluate the prediction of account and review by SpEagle 43 | Args: 44 | y: dictionary with key = user_id/review_id and value = ground truth (1 means spam, 0 means non-spam) 45 | pred_y: dictionary with key = user_id/review_id and value = p(y=spam | x) produced by SpEagle. 
46 | the keys in pred_y must be a subset of the keys in y 47 | """ 48 | posteriors = [] 49 | ground_truth = [] 50 | 51 | for k, v in pred_y.items(): 52 | if k in y: 53 | posteriors.append(v) 54 | ground_truth.append(y[k]) 55 | 56 | if len(np.unique(ground_truth)) < 2: 57 | warnings.warn("Only one class present in ground_truth, ROC AUC score will be omitted") 58 | ap = average_precision_score(ground_truth, posteriors) 59 | return None, ap 60 | else: 61 | auc = roc_auc_score(ground_truth, posteriors) 62 | ap = average_precision_score(ground_truth, posteriors) 63 | return auc, ap 64 | 65 | 66 | def scale_value(value_dict): 67 | """ 68 | Calculate and return a dict of the value of input dict scaled to (0, 1) 69 | """ 70 | 71 | ranked_dict = [(user, value_dict[user]) for user in value_dict.keys()] 72 | ranked_dict = sorted(ranked_dict, reverse=True, key=lambda x: x[1]) 73 | 74 | up_max, up_mean, up_min = ranked_dict[0][1], ranked_dict[int(len(ranked_dict) / 2)][1], ranked_dict[-1][1] 75 | 76 | scale_dict = {} 77 | for i, p in value_dict.items(): 78 | norm_value = (p - up_min) / (up_max - up_min) 79 | if norm_value == 0: # avoid the 0 80 | scale_dict[i] = 0 + 1e-7 81 | elif norm_value == 1: # avoid the 1 82 | scale_dict[i] = 1 - 1e-7 83 | else: 84 | scale_dict[i] = norm_value 85 | 86 | return scale_dict 87 | 88 | 89 | def nor_priors(priors): 90 | """ 91 | Normalize the node priors for GANG 92 | :param priors: 93 | :return: 94 | """ 95 | new_upriors, new_rpriors, new_ppriors = priors 96 | 97 | # normalize the node priors to (0,1) 98 | # if we normalize the prior, we need to set nor_flg to True for the gang model 99 | ranked_upriors = [(user, new_upriors[user]) for user in new_upriors.keys()] 100 | ranked_upriors = sorted(ranked_upriors, reverse=True, key=lambda x: x[1]) 101 | ranked_rpriors = [(user, new_rpriors[user]) for user in new_rpriors.keys()] 102 | ranked_rpriors = sorted(ranked_rpriors, reverse=True, key=lambda x: x[1]) 103 | ranked_ppriors = [(user, new_ppriors[user]) for user in new_ppriors.keys()] 104 | ranked_ppriors = sorted(ranked_ppriors, reverse=True, key=lambda x: x[1]) 105 | u_max, u_mean, u_min = ranked_upriors[0][1], ranked_upriors[int(len(ranked_upriors) / 2)][1], ranked_upriors[-1][1] 106 | p_max, p_mean, p_min = ranked_ppriors[0][1], ranked_ppriors[int(len(ranked_ppriors) / 2)][1], ranked_ppriors[-1][1] 107 | r_max, r_mean, r_min = ranked_rpriors[0][1], ranked_rpriors[int(len(ranked_rpriors) / 2)][1], ranked_rpriors[-1][1] 108 | for i, p in priors[0].items(): 109 | priors[0][i] = (p - u_min) / (u_max - u_min) 110 | for i, p in priors[1].items(): 111 | priors[1][i] = (p - r_min) / (r_max - r_min) 112 | for i, p in priors[2].items(): 113 | priors[2][i] = (p - p_min) / (p_max - p_min) 114 | 115 | return priors, [u_mean, r_mean, p_mean] 116 | 117 | 118 | def get_hash(data): 119 | import hashlib 120 | return hashlib.md5(data).hexdigest() 121 | 122 | 123 | def read_graph_data(metadata_filename, adj=False): 124 | """ Read the user-review-product graph from file. Can output the graph in different formats 125 | Args: 126 | metadata_filename: a gzipped file containing the graph. 
127 | adj: if True: create adjacent data, default is False 128 | Return: 129 | graph: user-review / prod-review / list of adjacent(adj=True) 130 | """ 131 | 132 | user_data = {} 133 | 134 | prod_data = {} 135 | 136 | adj_data = [] 137 | 138 | # use the rt mode to read ascii strings instead of binary 139 | if adj is False: 140 | with gzip.open(metadata_filename, 'rt') as f: 141 | # file format: each line is a tuple (user id, product id, rating, label, date) 142 | for line in f: 143 | items = line.strip().split() 144 | u_id = items[0] 145 | p_id = items[1] 146 | if items[2] != 'None': 147 | rating = float(items[2]) 148 | else: 149 | rating = 'None' 150 | label = int(items[3]) 151 | date = items[4] 152 | 153 | if u_id not in user_data: 154 | user_data[u_id] = [] 155 | user_data[u_id].append((p_id, rating, label, date)) 156 | 157 | if p_id not in prod_data: 158 | prod_data[p_id] = [] 159 | prod_data[p_id].append((u_id, rating, label, date)) 160 | 161 | # create adj_list [u_id, p_id, 1/2], where 1 indicates positive rating (4, 5) 162 | # and 2 indicates negative rating (1, 2, 3) 163 | 164 | print('read reviews from %s' % metadata_filename) 165 | print('number of users = %d' % len(user_data)) 166 | print('number of products = %d' % len(prod_data)) 167 | return user_data, prod_data 168 | else: 169 | # create adj_list [u_id, p_id, 1/2], where 1 indicates positive rating (4, 5) 170 | # and 2 indicates negative rating (1, 2, 3) 171 | with gzip.open(metadata_filename, 'rt') as f: 172 | # file format: each line is a tuple (user id, product id, rating, label, date) 173 | for line in f: 174 | items = line.strip().split() 175 | u_id = items[0] 176 | p_id = items[1] 177 | if items[2] != 'None': 178 | rating = float(items[2]) 179 | else: 180 | rating = 'None' 181 | label = int(items[3]) 182 | date = items[4] 183 | 184 | if u_id not in user_data: 185 | user_data[u_id] = [] 186 | user_data[u_id].append((p_id, rating, label, date)) 187 | 188 | if p_id not in prod_data: 189 | prod_data[p_id] = [] 190 | prod_data[p_id].append((u_id, rating, label, date)) 191 | 192 | if int(rating) <= 3: 193 | rating = int(2) 194 | else: 195 | rating = int(1) 196 | adj_data.append([u_id, p_id, rating]) 197 | 198 | print('read reviews from %s' % metadata_filename) 199 | print('number of users = %d' % len(user_data)) 200 | print('number of products = %d' % len(prod_data)) 201 | print('number of ratings = %d' % len(adj_data)) 202 | return user_data, prod_data, np.array(adj_data, dtype='int32') 203 | 204 | 205 | def depth(data): 206 | """ 207 | Get the depth of a dictionary 208 | Args: 209 | data: data in dictionary type 210 | 211 | Returns: the depth of a dictionary 212 | 213 | """ 214 | if isinstance(data, dict): 215 | return 1 + (max(map(depth, data.values())) if data else 0) 216 | return 0 217 | 218 | 219 | def data_checker(data): 220 | """ 221 | data validation 222 | Args: 223 | data: data in dictionary type 224 | 225 | Returns: pass the validation 226 | 227 | """ 228 | if isinstance(data, dict): 229 | if depth(data) < 3: 230 | raise Exception("The minimum depth of data must be 3. 
For example: {\'node1\':{\'node1_neighbor\':{" 231 | "neighbor's attribute}}}") 232 | else: 233 | raise AttributeError("Data must be stored in dictionary.") 234 | 235 | 236 | def dict_to_networkx(data): 237 | """ 238 | Convert data into networkx graph 239 | Args: 240 | data: data in dictionary type 241 | 242 | Returns: networkx graph 243 | 244 | """ 245 | data_checker(data) 246 | G = nx.Graph(data) 247 | return G 248 | 249 | 250 | def add_attribute_to_graph(graph, attribute, adding_type): 251 | """ 252 | Add new attributes to nodes/edges 253 | Args: 254 | graph: networkx graph 255 | attribute: dictionary of attributes for nodes/edges 256 | adding_type: string of node or edge 257 | 258 | Returns: 259 | networkx graph with new attributes 260 | """ 261 | if isinstance(attribute, dict): 262 | if isinstance(graph, nx.classes.graph.Graph): 263 | if adding_type == 'node': 264 | nx.set_node_attributes(graph, attribute) 265 | return graph 266 | elif adding_type == 'edge': 267 | nx.set_edge_attributes(graph, attribute) 268 | return graph 269 | else: 270 | raise Exception("Adding type must be \'node\' or \'edge\'.") 271 | else: 272 | raise Exception("The graph must be a networkx graph.") 273 | else: 274 | raise AttributeError("Attribute must be stored in dictionary.") 275 | 276 | 277 | def get_node_attributes_index(graph, attr): 278 | """ 279 | get node index for each attributes 280 | Args: 281 | graph: networkx graph 282 | attr: nodes' attribute 283 | 284 | Returns: 285 | a dict of list which contains every attribute index 286 | For example: {'user': ['201','202','203','204'], 'prod': ['0', '1', '2']} 287 | """ 288 | from collections import defaultdict 289 | node_temp = nx.get_node_attributes(graph, attr) 290 | reversed_dict = defaultdict(list) 291 | for key, value in node_temp.items(): 292 | reversed_dict[value].append(key) 293 | return reversed_dict 294 | 295 | 296 | def get_edge_attributes_index(graph, attr): 297 | """ 298 | get edge index for each attributes 299 | Args: 300 | graph: networkx graph 301 | attr: edges' attribute 302 | 303 | Returns: 304 | a dict of list which contains every attribute index 305 | For example: {'review': [('201', '0'), ('202', '0'), ('203', '0'), ('204', '0')]} 306 | """ 307 | from collections import defaultdict 308 | node_temp = nx.get_edge_attributes(graph, attr) 309 | reversed_dict = defaultdict(list) 310 | for key, value in node_temp.items(): 311 | reversed_dict[value].append(key) 312 | return reversed_dict 313 | 314 | 315 | def node_attr_filter(graph, attr, specific_attr, into_attr): 316 | """ 317 | get specific keys, values in conditions 318 | Args: 319 | graph: networkx graph 320 | attr: which attribute index you want to get 321 | specific_attr: which specific attribute index you want to get depending on attr 322 | into_attr: use specific attribute index to filter the attribute 323 | 324 | Returns: 325 | dict(node: into_attr values) 326 | For example: node_attr_filter(graph, 'types', 'user', 'prior) 327 | will return the dict( user_id: user_id_prior) 328 | 329 | """ 330 | attr_dict_index = get_node_attributes_index(graph, attr) 331 | specific_dict = attr_dict_index[specific_attr] 332 | filtered_dict = dict() 333 | into_dict = nx.get_node_attributes(graph, into_attr) 334 | for i in specific_dict: 335 | filtered_dict[i] = into_dict[i] 336 | return filtered_dict 337 | 338 | 339 | def edge_attr_filter(graph, attr, specific_attr, into_attr): 340 | """ 341 | get specific keys, values in conditions 342 | Args: 343 | graph: networkx graph 344 | attr: which attribute 
index you want to get 345 | specific_attr: which specific attribute index you want to get depending on attr 346 | into_attr: use specific attribute index to filter the attribute 347 | 348 | Returns: 349 | dict(edge: into_attr values) 350 | For example: edge_attr_filter(graph, 'types', 'review', 'prior) 351 | will return the dict(review_id: review_id_prior) 352 | 353 | """ 354 | attr_dict_index = get_edge_attributes_index(graph, attr) 355 | specific_dict = attr_dict_index[specific_attr] 356 | filtered_dict = dict() 357 | into_dict = nx.get_edge_attributes(graph, into_attr) 358 | for i in specific_dict: 359 | filtered_dict[i] = into_dict[i] 360 | return filtered_dict 361 | 362 | 363 | def save_graph(graph, graph_name=False): 364 | """ 365 | 366 | Args: 367 | graph: network graph 368 | graph_name: the file name of the graph, if graph_name=False, use default name 369 | 370 | Returns: 371 | None 372 | """ 373 | from networkx.readwrite import json_graph 374 | import json 375 | data = json_graph.node_link_data(graph) 376 | if graph_name is False: 377 | graph_name = 'graph_data.json' 378 | with open(graph_name, 'w') as f: 379 | json.dump(data, f) 380 | f.close() 381 | print('Saved graph data as {}'.format(graph_name)) 382 | 383 | 384 | def load_graph(json_name): 385 | """ 386 | 387 | Args: 388 | json_name: json file name 389 | 390 | Returns: 391 | networkx graph 392 | """ 393 | from networkx.readwrite import json_graph 394 | import json 395 | with open(json_name, 'r') as f: 396 | data = json.load(f) 397 | f.close() 398 | graph = json_graph.node_link_graph(data) 399 | print('Loaded {} into the nextorkx graph'.format(json_name)) 400 | return graph 401 | 402 | 403 | def timer(func): 404 | """Print the runtime of the decorated function""" 405 | @functools.wraps(func) 406 | def wrapper_timer(*args, **kwargs): 407 | start_time = time.perf_counter() 408 | value = func(*args, **kwargs) 409 | end_time = time.perf_counter() 410 | run_time = end_time - start_time 411 | print("Finished {} in {} secs".format(func.__name__, round(run_time, 3))) 412 | return value 413 | return wrapper_timer 414 | 415 | -------------------------------------------------------------------------------- /UGFraud/Yelp_Data/YelpChi/metadata.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/safe-graph/UGFraud/b47ac521d6a0fcc1d8880619275c9d48ccfa2997/UGFraud/Yelp_Data/YelpChi/metadata.gz -------------------------------------------------------------------------------- /UGFraud/Yelp_Data/YelpChi/priors.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/safe-graph/UGFraud/b47ac521d6a0fcc1d8880619275c9d48ccfa2997/UGFraud/Yelp_Data/YelpChi/priors.pkl -------------------------------------------------------------------------------- /UGFraud/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- -------------------------------------------------------------------------------- /UGFraud_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/safe-graph/UGFraud/b47ac521d6a0fcc1d8880619275c9d48ccfa2997/UGFraud_logo.png -------------------------------------------------------------------------------- /reference/fbox.txt: -------------------------------------------------------------------------------- 1 | @inproceedings{shah2014spotting, 2 | title={Spotting suspicious link behavior 
with fbox: An adversarial perspective}, 3 | author={Shah, Neil and Beutel, Alex and Gallagher, Brian and Faloutsos, Christos}, 4 | booktitle={2014 IEEE International Conference on Data Mining}, 5 | pages={959--964}, 6 | year={2014}, 7 | organization={IEEE} 8 | } -------------------------------------------------------------------------------- /reference/fraudar.txt: -------------------------------------------------------------------------------- 1 | @inproceedings{hooi2016fraudar, 2 | title={Fraudar: Bounding graph fraud in the face of camouflage}, 3 | author={Hooi, Bryan and Song, Hyun Ah and Beutel, Alex and Shah, Neil and Shin, Kijung and Faloutsos, Christos}, 4 | booktitle={Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining}, 5 | pages={895--904}, 6 | year={2016} 7 | } -------------------------------------------------------------------------------- /reference/gang.txt: -------------------------------------------------------------------------------- 1 | @inproceedings{wang2017gang, 2 | title={GANG: Detecting fraudulent users in online social networks via guilt-by-association on directed graphs}, 3 | author={Wang, Binghui and Gong, Neil Zhenqiang and Fu, Hao}, 4 | booktitle={2017 IEEE International Conference on Data Mining (ICDM)}, 5 | pages={465--474}, 6 | year={2017}, 7 | organization={IEEE} 8 | } -------------------------------------------------------------------------------- /reference/speagle.txt: -------------------------------------------------------------------------------- 1 | @inproceedings{rayana2015collective, 2 | title={Collective opinion spam detection: Bridging review networks and metadata}, 3 | author={Rayana, Shebuti and Akoglu, Leman}, 4 | booktitle={Proceedings of the 21th acm sigkdd international conference on knowledge discovery and data mining}, 5 | pages={985--994}, 6 | year={2015} 7 | } -------------------------------------------------------------------------------- /reference/svd.txt: -------------------------------------------------------------------------------- 1 | @incollection{golub1971singular, 2 | title={Singular value decomposition and least squares solutions}, 3 | author={Golub, Gene H and Reinsch, Christian}, 4 | booktitle={Linear Algebra}, 5 | pages={134--151}, 6 | year={1971}, 7 | publisher={Springer} 8 | } -------------------------------------------------------------------------------- /reference/zoobp.txt: -------------------------------------------------------------------------------- 1 | @article{eswaran2017zoobp, 2 | title={Zoobp: Belief propagation for heterogeneous networks}, 3 | author={Eswaran, Dhivya and G{\"u}nnemann, Stephan and Faloutsos, Christos and Makhija, Disha and Kumar, Mohit}, 4 | journal={Proceedings of the VLDB Endowment}, 5 | volume={10}, 6 | number={5}, 7 | pages={625--636}, 8 | year={2017}, 9 | publisher={VLDB Endowment} 10 | } -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | networkx>=2.2 2 | numpy>=1.16.6 3 | scipy>=1.2.3 4 | scikit-learn>=0.20.4 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | from os import path 3 | from io import open # for Python 2 and 3 compatibility 4 | 5 | this_directory = path.abspath(path.dirname(__file__)) 6 | 7 | # read the contents of requirements.txt 8 | with 
open(path.join(this_directory, 'requirements.txt'), 9 | encoding='utf-8') as f: 10 | requirements = f.read().splitlines() 11 | 12 | with open("README.md", "r") as fh: 13 | long_description = fh.read() 14 | 15 | setuptools.setup( 16 | name="UGFraud", # Replace with your own username 17 | version="0.1.1.2", 18 | author="Yingtong Dou, Chen Wang, Sihong Xie, Guixiang Ma, and UIC BDSC Lab", 19 | author_email="bdscsafegraph@gmail.com", 20 | description="An Unsupervised Graph-based Toolbox for Fraud Detection", 21 | long_description=long_description, 22 | include_package_data=True, 23 | long_description_content_type="text/markdown", 24 | url="https://github.com/safe-graph/UGFraud", 25 | download_url='https://github.com/safe-graph/UGFraud/archive/master.zip', 26 | keywords=['fraud detection', 'anomaly detection', 'graph algorithm', 27 | 'data mining', 'security'], 28 | package_data={ 29 | # If any package contains *.txt or *.rst files, include them: 30 | "UGFraud": ["Yelp_Data/YelpChi/*.gz", "Yelp_Data/YelpChi/*.pkl"]}, 31 | packages=setuptools.find_packages(), 32 | classifiers=[ 33 | 'Programming Language :: Python :: 3.6', 34 | 'Programming Language :: Python :: 3.7', 35 | 'Development Status :: 5 - Production/Stable', 36 | 'Intended Audience :: Education', 37 | 'Intended Audience :: Financial and Insurance Industry', 38 | 'Intended Audience :: Science/Research', 39 | 'Intended Audience :: Developers', 40 | 'Intended Audience :: Information Technology', 41 | 'License :: OSI Approved :: Apache Software License', 42 | 'Operating System :: OS Independent', 43 | ], 44 | python_requires='>=3.6', 45 | ) 46 | -------------------------------------------------------------------------------- /tests/testing.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 4 | from UGFraud.Demo.eval_fBox import * 5 | from UGFraud.Demo.eval_Fraudar import * 6 | from UGFraud.Demo.eval_GANG import * 7 | from UGFraud.Demo.eval_SpEagle import * 8 | from UGFraud.Demo.eval_SVD import * 9 | from UGFraud.Demo.eval_ZooBP import * 10 | from UGFraud.Demo.demo_pre import * 11 | 12 | 13 | sys.path.insert(0, os.path.abspath('../UGFraud/Demo/')) 14 | # data source 15 | file_name = 'Yelp_graph_data.json' 16 | # path_name = sys.path[0] + '/' + file_name 17 | try: 18 | G = load_graph(file_name) 19 | except FileNotFoundError: 20 | data_path = 'UGFraud/Yelp_Data/' 21 | data_to_network_graph(data_path) 22 | G = load_graph(file_name) 23 | user_ground_truth = node_attr_filter(G, 'types', 'user', 'label') 24 | review_ground_truth = edge_attr_filter(G, 'types', 'review', 'label') 25 | 26 | """ 27 | testing fBox 28 | """ 29 | print("*" * 80) 30 | print("Testing fBox") 31 | t = 20 # taus = [0.5, 1, 5, 10, 25, 50, 99] 32 | k = 50 # k = range(10, 51, 10) 33 | serBelief, reviewBelief = runfBox(G, t, k) 34 | reviewBelief = scale_value(reviewBelief) 35 | 36 | review_AUC, review_AP = evaluate(review_ground_truth, reviewBelief) 37 | print('review AUC = {}'.format(review_AUC)) 38 | print('review AP = {}'.format(review_AP)) 39 | 40 | """ 41 | testing Fraudar 42 | """ 43 | print("*" * 80) 44 | print("Testing Fraudar") 45 | userBelief, reviewBelief = runFraudar(G, multiple=0) 46 | reviewBelief = scale_value(reviewBelief) 47 | 48 | review_AUC, review_AP = evaluate(review_ground_truth, reviewBelief) 49 | print('review AUC = {}'.format(review_AUC)) 50 | print('review AP = {}'.format(review_AP)) 51 | 52 | """ 53 | 
testing GANG 54 | """ 55 | print("*" * 80) 56 | print("Testing GANG") 57 | # add semi-supervised user information / threshold 58 | sup_per = 0.1 59 | 60 | # run GANG model 61 | model = GANG(G, user_ground_truth, sup_per, nor_flg=True, sup_flg=False) 62 | 63 | # run Linearized Belief Propagation on product-user matrix with 1000 iterations 64 | iteration = 1000 65 | model.pu_lbp(iteration) 66 | userBelief, _, reviewBelief = model.classify() 67 | reviewBelief = scale_value(reviewBelief) 68 | 69 | # evaluation 70 | review_AUC, review_AP = evaluate(review_ground_truth, reviewBelief) 71 | print('review AUC = {}'.format(review_AUC)) 72 | print('review AP = {}'.format(review_AP)) 73 | 74 | """ 75 | testing Prior 76 | """ 77 | print("*" * 80) 78 | print("Testing Prior") 79 | # normalize the review prior as the review suspicious belief 80 | rpriors = edge_attr_filter(G, 'types', 'review', 'prior') 81 | reviewBelief = scale_value(rpriors) 82 | 83 | review_AUC, review_AP = evaluate(review_ground_truth, reviewBelief) 84 | print('review AUC = {}'.format(review_AUC)) 85 | print('review AP = {}'.format(review_AP)) 86 | 87 | """ 88 | testing SpEagle 89 | """ 90 | print("*" * 80) 91 | print("Testing SpEagle") 92 | # input parameters: numerical_eps, eps, num_iters, stop_threshold 93 | numerical_eps = 1e-5 94 | eps = 0.1 95 | user_review_potential = np.log(np.array([[1 - numerical_eps, numerical_eps], [numerical_eps, 1 - numerical_eps]])) 96 | review_product_potential = np.log(np.array([[1 - eps, eps], [eps, 1 - eps]])) 97 | potentials = {'u_r': user_review_potential, 'r_u': user_review_potential, 98 | 'r_p': review_product_potential, 'p_r': review_product_potential} 99 | max_iters = 4 100 | stop_threshold = 1e-3 101 | 102 | model = SpEagle(G, potentials, message=None, max_iters=max_iters) 103 | 104 | # build the BFS message-passing schedule, then run belief propagation 105 | model.schedule(schedule_type='bfs') 106 | 107 | start_iter = 0 108 | num_bp_iters = 2 109 | model.run_bp(start_iter=start_iter, max_iters=num_bp_iters, tol=stop_threshold) 110 | 111 | userBelief, reviewBelief, _ = model.classify() 112 | 113 | review_AUC, review_AP = evaluate(review_ground_truth, reviewBelief) 114 | print('review AUC = {}'.format(review_AUC)) 115 | print('review AP = {}'.format(review_AP)) 116 | 117 | """ 118 | testing SVD 119 | """ 120 | print("*" * 80) 121 | print("Testing SVD") 122 | percent = 0.9 123 | model = SVD(G) 124 | svd_output = model.run(percent) 125 | result = model.evaluate_SVD(svd_output, G) 126 | index = list(map(str, map(int, result[0]))) 127 | userBelief = dict(zip(index, result[1])) 128 | user_AUC, user_AP = evaluate(user_ground_truth, userBelief) 129 | print('user AUC = {}'.format(user_AUC)) 130 | print('user AP = {}'.format(user_AP)) 131 | 132 | """ 133 | testing ZooBP 134 | """ 135 | print("*" * 80) 136 | print("Testing ZooBP") 137 | ep = 0.01 138 | # H: compatibility matrix 139 | H = np.array([[0.5, -0.5], [-0.5, 0.5]]) 140 | 141 | model = ZooBP(G, ep, H) 142 | userBelief, _ = model.run() # result = (user_beliefs, prod_beliefs) 143 | 144 | user_AUC, user_AP = evaluate(user_ground_truth, userBelief) 145 | print('user AUC = {}'.format(user_AUC)) 146 | print('user AP = {}'.format(user_AP)) 147 | --------------------------------------------------------------------------------