├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── UGFraud ├── Demo │ ├── __init__.py │ ├── demo_pre.py │ ├── eval_Fraudar.py │ ├── eval_GANG.py │ ├── eval_Prior.py │ ├── eval_SVD.py │ ├── eval_SpEagle.py │ ├── eval_ZooBP.py │ ├── eval_fBox.py │ └── testing.py ├── Detector │ ├── Fraudar.py │ ├── GANG.py │ ├── MinTree.py │ ├── SVD.py │ ├── SpEagle.py │ ├── ZooBP.py │ ├── __init__.py │ └── fBox.py ├── Utils │ ├── __init__.py │ └── helper.py ├── Yelp_Data │ └── YelpChi │ │ ├── metadata.gz │ │ └── priors.pkl └── __init__.py ├── UGFraud_logo.png ├── reference ├── fbox.txt ├── fraudar.txt ├── gang.txt ├── speagle.txt ├── svd.txt └── zoobp.txt ├── requirements.txt ├── setup.py └── tests └── testing.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | **/*.ipynb_checkpoints/ 3 | **/*.ipynb/ 4 | **/__pycache__ 5 | /Detector/__pycache__ 6 | /Utils/__pycache__ 7 | __pycache__ 8 | __pycache__/ 9 | .idea 10 | .idea/ 11 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | - "3.7" 5 | # command to install dependencies 6 | install: 7 | - pip install -r requirements.txt 8 | script: 9 | - python tests/testing.py -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | [UGFraud logo (UGFraud_logo.png)]
9 | [Badges: Building | GitHub | Downloads | Pypi version]
25 | An Unsupervised Graph-based Toolbox for Fraud Detection
26 | 27 | **Introduction:** 28 | UGFraud is an unsupervised graph-based fraud detection toolbox that integrates several state-of-the-art graph-based fraud detection algorithms. It can be applied to bipartite graphs (e.g., user-product graph), and it can estimate the suspiciousness of both nodes and edges. The implemented models can be found [here](#implemented-models). 29 | 30 | The toolbox incorporates the Markov Random Field (MRF)-based algorithm, dense-block detection-based algorithm, and SVD-based algorithm. For MRF-based algorithms, the users only need the graph structure and the prior suspicious score of the nodes as the input. For other algorithms, the graph structure is the only input. 31 | 32 | Meanwhile, we have a [deep graph-based fraud detection toolbox](https://github.com/safe-graph/DGFraud) which implements state-of-the-art graph neural network-based fraud detectors. 33 | 34 | We welcome contributions on adding new fraud detectors and extending the features of the toolbox. Some of the planned features are listed in [TODO list](#todo-list). 35 | 36 | If you use the toolbox in your project, please cite the [paper](https://arxiv.org/abs/2006.06069) below and the [algorithms](#implemented-models) you used : 37 | ```bibtex 38 | @inproceedings{dou2020robust, 39 | title={Robust Spammer Detection by Nash Reinforcement Learning}, 40 | author={Dou, Yingtong and Ma, Guixiang and Yu, Philip S and Xie, Sihong}, 41 | booktitle={Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery \& Data Mining}, 42 | year={2020} 43 | } 44 | ``` 45 | 46 | **Useful Resources** 47 | - [PyGOD: A Python Library for Graph Outlier Detection (Anomaly Detection)](https://github.com/pygod-team/pygod) 48 | - [DGFraud: A Deep Graph-based Fraud Detection Toolbox](https://github.com/safe-graph/DGFraud) 49 | - [Graph-based Fraud Detection Paper List](https://github.com/safe-graph/graph-fraud-detection-papers) 50 | - [Awesome Fraud Detection Papers](https://github.com/benedekrozemberczki/awesome-fraud-detection-papers) 51 | - [Attack and Defense Papers on Graph Data](https://github.com/safe-graph/graph-adversarial-learning-literature) 52 | - [PyOD: A Python Toolbox for Scalable Outlier Detection (Anomaly Detection)](https://github.com/yzhao062/pyod) 53 | - [PyODD: An End-to-end Outlier Detection System](https://github.com/datamllab/pyodds) 54 | - [DGL: Deep Graph Library](https://github.com/dmlc/dgl) 55 | - [Outlier Detection DataSets (ODDS)](http://odds.cs.stonybrook.edu/) 56 | 57 | **Table of Contents** 58 | - [Installation](#installation) 59 | - [User Guide](#user-guide) 60 | - [Implemented Models](#implemented-models) 61 | - [Model Comparison](#model-comparison) 62 | - [TODO List](#todo-list) 63 | - [How to Contribute](#how-to-contribute) 64 | 65 | 66 | ## Installation 67 | You can install UGFraud from `pypi`: 68 | 69 | ```bash 70 | pip install UGFraud 71 | ``` 72 | 73 | or download and install from `github`: 74 | 75 | ```bash 76 | git clone https://github.com/safe-graph/UGFraud.git 77 | cd UGFraud 78 | python setup.py install 79 | ``` 80 | 81 | ### Dataset 82 | The demo data is not the intact data (`rating` and `date` information are missing). The rating information is only used in ZooBP demo. If you need the intact date to play demo, please email [bdscsafegraph@gmail.com](mailto:bdscsafegraph@gmail.com) to download the intact data from [Yelp Spam Review Dataset](http://odds.cs.stonybrook.edu/yelpchi-dataset/). 
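Both the bundled demo data and the intact data are turned into the graph that the demo scripts consume by `/UGFraud/Demo/demo_pre.py`. A minimal sketch (run from the `UGFraud/Demo/` folder, since the demo uses paths relative to it):

```python
# Build the demo graph once; this mirrors UGFraud/Demo/demo_pre.py and assumes
# the YelpChi demo files are located under ../Yelp_Data/YelpChi/.
from UGFraud.Demo.demo_pre import data_to_network_graph
from UGFraud.Utils.helper import load_graph

data_to_network_graph('../Yelp_Data/')  # reads metadata.gz and priors.pkl, writes Yelp_graph_data.json
G = load_graph('Yelp_graph_data.json')  # networkx graph with node/edge priors and labels attached
```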
The `metadata.gz` file in `/UGFraud/Yelp_Data/YelpChi` includes:
83 | - `user_id`: 38063 users
84 | - `product_id`: 201 products
85 | - `rating`: from 1.0 (low) to 5.0 (high)
86 | - `label`: -1 is not spam, 1 is spam
87 | - `date`: review creation date
88 |
89 |
90 | ## User Guide
91 |
92 | ### Running the example code
93 | You can find the example scripts for the implemented models in the `/UGFraud/Demo/` directory. For example, you can run fBox using:
94 | ```bash
95 | python eval_fBox.py
96 | ```
97 |
98 | ### Running on your datasets
99 | Check out the `data_to_network_graph` function in `/UGFraud/Demo/demo_pre.py` to convert your data into a [networkx](https://networkx.github.io/documentation/stable/tutorial.html#creating-a-graph) graph.
100 |
101 | To use your own data, you need to provide at least the following information:
102 | * a dict of dicts:
103 | ```
104 | 'user_id': {
105 | 'product_id': {
106 | 'label': 1
107 | }
108 | }
109 | ```
110 | * a dict of priors
111 |
112 | You can use the `dict_to_networkx(graph_dict)` function from `/Utils/helper.py` to convert your `graph_dict` into a networkx graph.
113 | For more details, please see the `data_to_network_graph` function in `/UGFraud/Demo/demo_pre.py`.
114 |
115 | ### The structure of code
116 | The `/UGFraud` repository is organized as follows:
117 | - `Demo/` contains the example code for running the implemented models;
118 | - `Detector/` contains the model implementations;
119 | - `Yelp_Data/` contains the necessary dataset files;
120 | - `Utils/` contains all the helper functions.
121 |
122 |
123 | ## Implemented Models
124 |
125 | | Model | Paper | Venue | Reference |
126 | |-------|--------|--------|--------|
127 | | **SpEagle** | [Collective Opinion Spam Detection: Bridging Review Networks and Metadata](https://www.andrew.cmu.edu/user/lakoglu/pubs/15-kdd-collectiveopinionspam.pdf) | KDD 2015 | [BibTex](https://github.com/safe-graph/UGFraud/blob/master/reference/speagle.txt) |
128 | | **GANG** | [GANG: Detecting Fraudulent Users in Online Social Networks via Guilt-by-Association on Directed Graph](https://ieeexplore.ieee.org/document/8215519) | ICDM 2017 | [BibTex](https://github.com/safe-graph/UGFraud/blob/master/reference/gang.txt) |
129 | | **fBox** | [Spotting Suspicious Link Behavior with fBox: An Adversarial Perspective](https://arxiv.org/pdf/1410.3915.pdf) | ICDM 2014 | [BibTex](https://github.com/safe-graph/UGFraud/blob/master/reference/fbox.txt) |
130 | | **Fraudar** | [FRAUDAR: Bounding Graph Fraud in the Face of Camouflage](https://bhooi.github.io/papers/fraudar_kdd16.pdf) | KDD 2016 | [BibTex](https://github.com/safe-graph/UGFraud/blob/master/reference/fraudar.txt) |
131 | | **ZooBP** | [ZooBP: Belief Propagation for Heterogeneous Networks](http://www.vldb.org/pvldb/vol10/p625-eswaran.pdf) | VLDB 2017 | [BibTex](https://github.com/safe-graph/UGFraud/blob/master/reference/zoobp.txt) |
132 | | **SVD** | [Singular value decomposition and least squares solutions](https://link.springer.com/content/pdf/10.1007/978-3-662-39778-7_10.pdf) | - | [BibTex](https://github.com/safe-graph/UGFraud/blob/master/reference/svd.txt) |
133 | | **Prior** | Evaluating suspiciousness based on prior information | - | - |
134 |
135 |
136 | ## Model Comparison
137 | | Model | Application | Graph Type | Model Type |
138 | |-------|--------|--------|-------|
139 | | **SpEagle** | Review Spam | Tripartite | MRF |
140 | | **GANG** | Social Sybil | Bipartite | MRF |
141 | | **fBox** | Social Fraudster | Bipartite | SVD |
142 | | **Fraudar** | Social Fraudster | Bipartite | Dense-block |
143
| | **ZooBP** | E-commerce Fraud | Tripartite | MRF | 144 | | **SVD** | Dimension Reduction | Bipartite | SVD | 145 | 146 | 147 | ## TODO List 148 | - Homogeneous graph implementation 149 | 150 | 151 | ## How to Contribute 152 | You are welcomed to contribute to this open-source toolbox. Currently, you can create issues or send email to [bdscsafegraph@gmail.com](mailto:bdscsafegraph@gmail.com) for inquiry. 153 | -------------------------------------------------------------------------------- /UGFraud/Demo/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- -------------------------------------------------------------------------------- /UGFraud/Demo/demo_pre.py: -------------------------------------------------------------------------------- 1 | from UGFraud.Utils.helper import * 2 | import networkx as nx 3 | import sys 4 | import os 5 | import random 6 | import pickle as pkl 7 | sys.path.insert(0, os.path.abspath('../../')) 8 | 9 | 10 | def data_to_network_graph(data_path): 11 | # data source 12 | data_name = 'YelpChi' 13 | # prefix = '../Yelp_Data/' + data_name + '/' 14 | prefix = data_path + data_name + '/' 15 | metadata_filename = prefix + 'metadata.gz' 16 | Checksum = 'f454ce0a5f506e0be062dc8aefb76b25' 17 | AUTHORIZED = False 18 | 19 | # valid YelpChi data 20 | with gzip.open(metadata_filename, 'rb') as f: 21 | file_content = f.read() 22 | f.close() 23 | if Checksum == get_hash(file_content): 24 | AUTHORIZED = True 25 | else: 26 | print('-' * 80) 27 | print('The demo data is not the intact data, if you need intact data, please download from:') 28 | print('http://odds.cs.stonybrook.edu/yelpchi-dataset/') 29 | print('-' * 80) 30 | 31 | """ 32 | read the graph and node priors 33 | user_product_graph: {'201': [('0', 1)], ... 
} 34 | product_user_graph: {'0': [('201', 1), ('202', 1), ...], ...} 35 | 36 | """ 37 | user_product_graph, product_user_graph = read_graph_data(metadata_filename) 38 | user_ground_truth, review_ground_truth = create_ground_truth(user_product_graph) 39 | 40 | # load priors 41 | with open(prefix + 'priors.pkl', 'rb') as f: 42 | priors = pkl.load(f) 43 | 44 | # convert user_product_graph to dict of dict 45 | # graph_dict: {'201': {'0': {'rating': 1, 'label': 1, 'date': '2011-06-08'}},...} 46 | graph_dict = dict() 47 | for k, v in user_product_graph.items(): 48 | graph_dict[k] = dict() 49 | for line in v: 50 | if line[2] == -1: 51 | new_line_2 = 0 52 | else: 53 | new_line_2 = 1 54 | # if demo data is not intact, generate rating randomly 55 | if type(line[1]) is str: 56 | new_line_1 = random.choice([0, 1]) 57 | elif line[1] >= 4: 58 | new_line_1 = 1 59 | else: 60 | new_line_1 = 2 61 | graph_dict[k][line[0]] = {'rating': new_line_1, 'label': new_line_2, 'date': line[3]} 62 | 63 | # put graph_dict into networkx graph 64 | G = dict_to_networkx(graph_dict) 65 | 66 | # we also can convert the graph into dict of dicts 67 | dict_of_dicts = nx.to_dict_of_dicts(G) 68 | 69 | # organize nodes' attributes, attributes must be the dict of dicts: 70 | # for example: {'201': {'prior': 0.1997974972380755, 'types': 'user'}, ...} 71 | user_node_priors = priors[0] 72 | node_attr = dict() 73 | for k, v in user_node_priors.items(): 74 | node_attr[k] = {'prior': v, 'types': 'user', 'label': user_ground_truth[k]} 75 | # add nodes' new attributes to the graph 76 | add_attribute_to_graph(graph=G, attribute=node_attr, adding_type='node') 77 | prod_node_priors = priors[2] 78 | node_attr = dict() 79 | for k, v in prod_node_priors.items(): 80 | node_attr[k] = {'prior': v, 'types': 'prod'} 81 | # add nodes' new attributes to the graph 82 | add_attribute_to_graph(graph=G, attribute=node_attr, adding_type='node') 83 | 84 | # check new attributes 85 | G.nodes.get('201') 86 | 87 | # organize edges'attributes, attributes must be the dict of dicts: 88 | # for example: {('201', '0'): {'prior': 0.35048557119705304, 'types': 'review'}, ...} 89 | edge_priors = priors[1] 90 | edge_attr = dict() 91 | for k, v in edge_priors.items(): 92 | edge_attr[k] = {'prior': v, 'types': 'review'} 93 | # add edges' new attributes to the graph 94 | add_attribute_to_graph(graph=G, attribute=edge_attr, adding_type='edge') 95 | # check new attributes 96 | G.edges.get(('201', '0')) 97 | 98 | # save graph data into json 99 | graph_name = 'Yelp_graph_data.json' 100 | save_graph(graph=G, graph_name=graph_name) 101 | 102 | # load json into graph 103 | loaded_G = load_graph(graph_name) 104 | 105 | 106 | if __name__ == '__main__': 107 | data_path = '../Yelp_Data/' 108 | data_to_network_graph(data_path) 109 | -------------------------------------------------------------------------------- /UGFraud/Demo/eval_Fraudar.py: -------------------------------------------------------------------------------- 1 | """ 2 | 'FRAUDAR: Bounding Graph Fraud in the Face of camouflage' 3 | Spot fraudsters in the presence of camouflage or hijacked accounts. An algorithm that is camouflage-resistant, 4 | provides upper bounds on the effectiveness of fraudsters, and the algorithm is effective in real-world data. 
5 | Article: https://bhooi.github.io/papers/fraudar_kdd16.pdf 6 | """ 7 | 8 | from UGFraud.Utils.helper import * 9 | from UGFraud.Detector.Fraudar import * 10 | import copy as cp 11 | import sys 12 | import os 13 | sys.path.insert(0, os.path.abspath('../../')) 14 | 15 | 16 | def listToSparseMatrix(edgesSource, edgesDest): 17 | m = max(edgesSource) + 1 18 | n = max(edgesDest) + 1 19 | M = sparse.coo_matrix(([1] * len(edgesSource), (edgesSource, edgesDest)), shape=(m, n)) 20 | M1 = M > 0 21 | return M1.astype('int') 22 | 23 | 24 | @timer 25 | def runFraudar(graph, multiple=0): 26 | new_upriors = node_attr_filter(graph, 'types', 'user', 'prior') 27 | new_rpriors = edge_attr_filter(graph, 'types', 'review', 'prior') 28 | # print('Start detection on the new graph with Fraudar') 29 | user_to_product = {} 30 | prod_to_user = {} 31 | u_id_dict = node_attr_filter(graph, 'types', 'user', 'types') 32 | for u_id in u_id_dict.keys(): 33 | if u_id not in user_to_product: 34 | user_to_product[u_id] = [] 35 | for p_id in graph[u_id].keys(): 36 | if p_id not in prod_to_user: 37 | prod_to_user[p_id] = [] 38 | user_to_product[u_id].append(p_id) 39 | prod_to_user[p_id].append(u_id) 40 | u_id2idx = {} 41 | p_id2idx = {} 42 | idx2u_id = {} 43 | idx2p_id = {} 44 | i = 0 45 | for u_id in user_to_product.keys(): 46 | u_id2idx[u_id] = i 47 | idx2u_id[i] = u_id 48 | i += 1 49 | 50 | i = 0 51 | for p_id in prod_to_user.keys(): 52 | p_id2idx[p_id] = i 53 | idx2p_id[i] = p_id 54 | i += 1 55 | 56 | edgesSource = [] 57 | edgesDest = [] 58 | for u_id in u_id_dict.keys(): 59 | for p_id in graph[u_id].keys(): 60 | edgesSource.append(u_id2idx[u_id]) 61 | edgesDest.append(p_id2idx[p_id]) 62 | M = listToSparseMatrix(edgesSource, edgesDest) 63 | # print("finished reading data ") 64 | 65 | if multiple == 0: 66 | # detect all dense blocks 67 | res = detect_blocks(M, logWeightedAveDegree) 68 | else: 69 | # detect the top #multiple dense blocks 70 | res = detectMultiple(M, logWeightedAveDegree, multiple) 71 | 72 | detected_users = {} 73 | weight_dict = {} 74 | for lwRes in res: 75 | detected_u_idx = lwRes[0][0] 76 | detected_p_idx = lwRes[0][1] 77 | weight = lwRes[1] 78 | weight_dict[weight] = weight 79 | for i in detected_u_idx: 80 | uid_tmp = idx2u_id[i] 81 | if uid_tmp not in detected_users.keys(): 82 | detected_users[uid_tmp] = weight 83 | 84 | max_den = res[0][1] 85 | min_den = res[-1][1] 86 | den_interval = max_den - min_den 87 | 88 | ranked_rpriors = [(review, new_rpriors[review]) for review in new_rpriors.keys()] 89 | ranked_rpriors = sorted(ranked_rpriors, reverse=True, key=lambda x: x[1]) 90 | r_max, r_mean, r_min = ranked_rpriors[0][1], ranked_rpriors[int(len(ranked_rpriors) / 2)][1], ranked_rpriors[-1][1] 91 | aux_rpriors = cp.deepcopy(new_rpriors) 92 | for i, p in aux_rpriors.items(): 93 | new_rpriors[i] = (p - r_min) / (r_max - r_min) 94 | 95 | user_density = {} 96 | for u in new_upriors.keys(): 97 | if u in detected_users.keys(): 98 | user_density[u] = (detected_users[u] - min_den) / den_interval 99 | else: 100 | user_density[u] = 1e-6 101 | 102 | user_prob = {} 103 | review_prob = {} 104 | for review in new_rpriors.keys(): 105 | review_prob.update({review: 1e-6}) 106 | user_prob.update({review[0]: 1e-6}) 107 | print(len(detected_users)) 108 | print(detected_users['302']) 109 | 110 | for user in detected_users.keys(): 111 | user_prob.update({user: user_density[user]}) 112 | for prod in graph[user].keys(): 113 | review_prob.update({(user, prod): user_density[user]}) 114 | 115 | return user_prob, review_prob 116 | 117 
| 118 | if __name__ == '__main__': 119 | # data source 120 | file_name = 'Yelp_graph_data.json' 121 | G = load_graph(file_name) 122 | review_ground_truth = edge_attr_filter(G, 'types', 'review', 'label') 123 | 124 | # run Fraudar on the reviews 125 | userBelief, reviewBelief = runFraudar(G, multiple=0) 126 | reviewBelief = scale_value(reviewBelief) 127 | 128 | review_AUC, review_AP = evaluate(review_ground_truth, reviewBelief) 129 | print('review AUC = {}'.format(review_AUC)) 130 | print('review AP = {}'.format(review_AP)) 131 | 132 | 133 | -------------------------------------------------------------------------------- /UGFraud/Demo/eval_GANG.py: -------------------------------------------------------------------------------- 1 | from UGFraud.Detector.GANG import * 2 | import sys 3 | import os 4 | sys.path.insert(0, os.path.abspath('../../')) 5 | 6 | 7 | if __name__ == '__main__': 8 | # data source 9 | file_name = 'Yelp_graph_data.json' 10 | G = load_graph(file_name) 11 | user_ground_truth = node_attr_filter(G, 'types', 'user', 'label') 12 | review_ground_truth = edge_attr_filter(G, 'types', 'review', 'label') 13 | 14 | # add semi-supervised user information / threshold 15 | sup_per = 0.1 16 | 17 | # run GANG model 18 | model = GANG(G, user_ground_truth, sup_per, nor_flg=True, sup_flg=False) 19 | 20 | # run Linearized Belief Propagation on product-user matrix with 1000 iterations 21 | iteration = 1000 22 | model.pu_lbp(iteration) 23 | userBelief, _, reviewBelief = model.classify() 24 | reviewBelief = scale_value(reviewBelief) 25 | 26 | # evaluation 27 | review_AUC, review_AP = evaluate(review_ground_truth, reviewBelief) 28 | print('review AUC = {}'.format(review_AUC)) 29 | print('review AP = {}'.format(review_AP)) 30 | -------------------------------------------------------------------------------- /UGFraud/Demo/eval_Prior.py: -------------------------------------------------------------------------------- 1 | from Utils.helper import * 2 | 3 | if __name__ == '__main__': 4 | # data source 5 | file_name = 'Yelp_graph_data.json' 6 | G = load_graph(file_name) 7 | review_ground_truth = edge_attr_filter(G, 'types', 'review', 'label') 8 | 9 | # normalize the review prior as the review suspicious belief 10 | rpriors = edge_attr_filter(G, 'types', 'review', 'prior') 11 | reviewBelief = scale_value(rpriors) 12 | 13 | review_AUC, review_AP = evaluate(review_ground_truth, reviewBelief) 14 | print('review AUC = {}'.format(review_AUC)) 15 | print('review AP = {}'.format(review_AP)) 16 | -------------------------------------------------------------------------------- /UGFraud/Demo/eval_SVD.py: -------------------------------------------------------------------------------- 1 | from UGFraud.Detector.SVD import * 2 | import sys 3 | import os 4 | sys.path.insert(0, os.path.abspath('../../')) 5 | 6 | if __name__ == '__main__': 7 | # data source 8 | file_name = 'Yelp_graph_data.json' 9 | G = load_graph(file_name) 10 | user_ground_truth = node_attr_filter(G, 'types', 'user', 'label') 11 | 12 | percent = 0.9 13 | model = SVD(G) 14 | svd_output = model.run(percent) 15 | result = model.evaluate_SVD(svd_output, G) 16 | index = list(map(str, map(int, result[0]))) 17 | userBelief = dict(zip(index, result[1])) 18 | review_AUC, review_AP = evaluate(user_ground_truth, userBelief) 19 | print('review AUC = {}'.format(review_AUC)) 20 | print('review AP = {}'.format(review_AP)) -------------------------------------------------------------------------------- /UGFraud/Demo/eval_SpEagle.py: 
-------------------------------------------------------------------------------- 1 | from UGFraud.Detector.SpEagle import * 2 | 3 | if __name__ == '__main__': 4 | # data source 5 | file_name = 'Yelp_graph_data.json' 6 | G = load_graph(file_name) 7 | review_ground_truth = edge_attr_filter(G, 'types', 'review', 'label') 8 | 9 | # input parameters: numerical_eps, eps, num_iters, stop_threshold 10 | numerical_eps = 1e-5 11 | eps = 0.1 12 | user_review_potential = np.log(np.array([[1 - numerical_eps, numerical_eps], [numerical_eps, 1 - numerical_eps]])) 13 | review_product_potential = np.log(np.array([[1 - eps, eps], [eps, 1 - eps]])) 14 | potentials = {'u_r': user_review_potential, 'r_u': user_review_potential, 15 | 'r_p': review_product_potential, 'p_r': review_product_potential} 16 | max_iters = 4 17 | stop_threshold = 1e-3 18 | 19 | model = SpEagle(G, potentials, message=None, max_iters=4) 20 | 21 | # new runbp func 22 | model.schedule(schedule_type='bfs') 23 | 24 | iter = 0 25 | num_bp_iters = 2 26 | model.run_bp(start_iter=iter, max_iters=num_bp_iters, tol=stop_threshold) 27 | 28 | userBelief, reviewBelief, _ = model.classify() 29 | 30 | review_AUC, review_AP = evaluate(review_ground_truth, reviewBelief) 31 | print('review AUC = {}'.format(review_AUC)) 32 | print('review AP = {}'.format(review_AP)) 33 | -------------------------------------------------------------------------------- /UGFraud/Demo/eval_ZooBP.py: -------------------------------------------------------------------------------- 1 | from UGFraud.Utils.helper import * 2 | from UGFraud.Detector.ZooBP import * 3 | import sys 4 | import os 5 | sys.path.insert(0, os.path.abspath('../../')) 6 | 7 | 8 | if __name__ == '__main__': 9 | # data source 10 | file_name = 'Yelp_graph_data.json' 11 | G = load_graph(file_name) 12 | user_ground_truth = node_attr_filter(G, 'types', 'user', 'label') 13 | 14 | ep = 0.01 15 | # H: compatibility matrix 16 | H = np.array([[0.5, -0.5], [-0.5, 0.5]]) 17 | 18 | model = ZooBP(G, ep, H) 19 | userBelief, _ = model.run() # result = (user_beliefs, prod_beliefs) 20 | 21 | review_AUC, review_AP = evaluate(user_ground_truth, userBelief) 22 | print('review AUC = {}'.format(review_AUC)) 23 | print('review AP = {}'.format(review_AP)) -------------------------------------------------------------------------------- /UGFraud/Demo/eval_fBox.py: -------------------------------------------------------------------------------- 1 | from UGFraud.Utils.helper import * 2 | from UGFraud.Detector.fBox import * 3 | 4 | 5 | def runfBox(graph, t, k): 6 | user_priors = node_attr_filter(graph, 'types', 'user', 'prior') 7 | review_priors = edge_attr_filter(graph, 'types', 'review', 'prior') 8 | 9 | # run fBox 10 | model = fBox(graph) 11 | num_detected_users = [] 12 | 13 | detected_users_by_degree, detected_products_by_degree = model.run(t, k) 14 | detected_users = set() 15 | for d, user_list in detected_users_by_degree.items(): 16 | detected_users.update([u for u in user_list]) 17 | 18 | num_detected_users.append(len(detected_users)) 19 | 20 | detected_products = set() 21 | for d, prod_list in detected_products_by_degree.items(): 22 | detected_products.update([p for p in prod_list]) 23 | 24 | result_uid = [] 25 | user_prob = {} # result_prob means user_prob 26 | review_prob = {} 27 | for u, v in user_priors.items(): 28 | result_uid.append(u) 29 | if u in detected_users: 30 | user_prob.update({u: user_priors.get(u)}) 31 | else: 32 | user_prob.update({u: 1e-7}) 33 | 34 | for user_prod in graph.edges: 35 | if user_prod[0] in 
detected_users: 36 | review_prob[(user_prod[0], user_prod[1])] = review_priors.get((user_prod[0], user_prod[1])) 37 | else: 38 | review_prob[(user_prod[0], user_prod[1])] = 0 39 | 40 | return user_prob, review_prob 41 | 42 | 43 | if __name__ == '__main__': 44 | # data source 45 | file_name = 'Yelp_graph_data.json' 46 | G = load_graph(file_name) 47 | review_ground_truth = edge_attr_filter(G, 'types', 'review', 'label') 48 | 49 | # important parameters 50 | t = 20 # taus = [0.5, 1, 5, 10, 25, 50, 99] 51 | k = 50 # k = range(10, 51, 10) 52 | 53 | userBelief, reviewBelief = runfBox(G, t, k) 54 | 55 | reviewBelief = scale_value(reviewBelief) 56 | 57 | review_AUC, review_AP = evaluate(review_ground_truth, reviewBelief) 58 | print('review AUC = {}'.format(review_AUC)) 59 | print('review AP = {}'.format(review_AP)) 60 | -------------------------------------------------------------------------------- /UGFraud/Demo/testing.py: -------------------------------------------------------------------------------- 1 | from UGFraud.Demo.eval_fBox import * 2 | from UGFraud.Demo.eval_Fraudar import * 3 | from UGFraud.Demo.eval_GANG import * 4 | from UGFraud.Demo.eval_SpEagle import * 5 | from UGFraud.Demo.eval_SVD import * 6 | from UGFraud.Demo.eval_ZooBP import * 7 | from UGFraud.Demo.demo_pre import * 8 | import sys 9 | import os 10 | 11 | sys.path.insert(0, os.path.abspath('../../')) 12 | 13 | # data source 14 | file_name = 'Yelp_graph_data.json' 15 | try: 16 | G = load_graph(file_name) 17 | except FileNotFoundError: 18 | data_path = '../Yelp_Data/' 19 | data_to_network_graph(data_path) 20 | G = load_graph(file_name) 21 | user_ground_truth = node_attr_filter(G, 'types', 'user', 'label') 22 | review_ground_truth = edge_attr_filter(G, 'types', 'review', 'label') 23 | 24 | """ 25 | testing fBox 26 | """ 27 | print("*" * 80) 28 | print("Testing fBox") 29 | t = 20 # taus = [0.5, 1, 5, 10, 25, 50, 99] 30 | k = 50 # k = range(10, 51, 10) 31 | serBelief, reviewBelief = runfBox(G, t, k) 32 | reviewBelief = scale_value(reviewBelief) 33 | 34 | review_AUC, review_AP = evaluate(review_ground_truth, reviewBelief) 35 | print('review AUC = {}'.format(review_AUC)) 36 | print('review AP = {}'.format(review_AP)) 37 | 38 | """ 39 | testing Fraudar 40 | """ 41 | print("*" * 80) 42 | print("Testing Fraudar") 43 | userBelief, reviewBelief = runFraudar(G, multiple=0) 44 | reviewBelief = scale_value(reviewBelief) 45 | 46 | review_AUC, review_AP = evaluate(review_ground_truth, reviewBelief) 47 | print('review AUC = {}'.format(review_AUC)) 48 | print('review AP = {}'.format(review_AP)) 49 | 50 | """ 51 | testing GANG 52 | """ 53 | print("*" * 80) 54 | print("Testing GANG") 55 | # add semi-supervised user information / threshold 56 | sup_per = 0.1 57 | 58 | # run GANG model 59 | model = GANG(G, user_ground_truth, sup_per, nor_flg=True, sup_flg=False) 60 | 61 | # run Linearized Belief Propagation on product-user matrix with 1000 iterations 62 | iteration = 1000 63 | model.pu_lbp(iteration) 64 | userBelief, _, reviewBelief = model.classify() 65 | reviewBelief = scale_value(reviewBelief) 66 | 67 | # evaluation 68 | review_AUC, review_AP = evaluate(review_ground_truth, reviewBelief) 69 | print('review AUC = {}'.format(review_AUC)) 70 | print('review AP = {}'.format(review_AP)) 71 | 72 | """ 73 | testing Prior 74 | """ 75 | print("*" * 80) 76 | print("Testing Prior") 77 | # normalize the review prior as the review suspicious belief 78 | rpriors = edge_attr_filter(G, 'types', 'review', 'prior') 79 | reviewBelief = scale_value(rpriors) 80 | 
81 | review_AUC, review_AP = evaluate(review_ground_truth, reviewBelief) 82 | print('review AUC = {}'.format(review_AUC)) 83 | print('review AP = {}'.format(review_AP)) 84 | 85 | """ 86 | testing SpEagle 87 | """ 88 | print("*" * 80) 89 | print("Testing SpEagle") 90 | # input parameters: numerical_eps, eps, num_iters, stop_threshold 91 | numerical_eps = 1e-5 92 | eps = 0.1 93 | user_review_potential = np.log(np.array([[1 - numerical_eps, numerical_eps], [numerical_eps, 1 - numerical_eps]])) 94 | review_product_potential = np.log(np.array([[1 - eps, eps], [eps, 1 - eps]])) 95 | potentials = {'u_r': user_review_potential, 'r_u': user_review_potential, 96 | 'r_p': review_product_potential, 'p_r': review_product_potential} 97 | max_iters = 4 98 | stop_threshold = 1e-3 99 | 100 | model = SpEagle(G, potentials, message=None, max_iters=4) 101 | 102 | # new runbp func 103 | model.schedule(schedule_type='bfs') 104 | 105 | iter = 0 106 | num_bp_iters = 2 107 | model.run_bp(start_iter=iter, max_iters=num_bp_iters, tol=stop_threshold) 108 | 109 | userBelief, reviewBelief, _ = model.classify() 110 | 111 | review_AUC, review_AP = evaluate(review_ground_truth, reviewBelief) 112 | print('review AUC = {}'.format(review_AUC)) 113 | print('review AP = {}'.format(review_AP)) 114 | 115 | """ 116 | testing SVG 117 | """ 118 | print("*" * 80) 119 | print("Testing SVD") 120 | percent = 0.9 121 | model = SVD(G) 122 | svd_output = model.run(percent) 123 | result = model.evaluate_SVD(svd_output, G) 124 | index = list(map(str, map(int, result[0]))) 125 | userBelief = dict(zip(index, result[1])) 126 | review_AUC, review_AP = evaluate(user_ground_truth, userBelief) 127 | print('review AUC = {}'.format(review_AUC)) 128 | print('review AP = {}'.format(review_AP)) 129 | 130 | """ 131 | testing ZooBP 132 | """ 133 | print("*" * 80) 134 | print("Testing ZooBp") 135 | ep = 0.01 136 | # H: compatibility matrix 137 | H = np.array([[0.5, -0.5], [-0.5, 0.5]]) 138 | 139 | model = ZooBP(G, ep, H) 140 | userBelief, _ = model.run() # result = (user_beliefs, prod_beliefs) 141 | 142 | review_AUC, review_AP = evaluate(user_ground_truth, userBelief) 143 | print('review AUC = {}'.format(review_AUC)) 144 | print('review AP = {}'.format(review_AP)) 145 | -------------------------------------------------------------------------------- /UGFraud/Detector/Fraudar.py: -------------------------------------------------------------------------------- 1 | """ 2 | contains functions that run the greedy detector for dense regions in a sparse matrix. 3 | use aveDegree or sqrtWeightedAveDegree or logWeightedAveDegree on a sparse matrix, 4 | which returns ((rowSet, colSet), score) for the most suspicious block. 5 | """ 6 | 7 | from __future__ import division 8 | from UGFraud.Detector.MinTree import MinTree 9 | from scipy import sparse 10 | import random 11 | import numpy as np 12 | 13 | 14 | # given a list of lists where each row is an edge, this returns the sparse matrix representation of the data. 15 | def listToSparseMatrix(edges_source, edges_des): 16 | m = max(edges_source) + 1 17 | n = max(edges_des) + 1 18 | M = sparse.coo_matrix(([1] * len(edges_source), (edges_source, edges_des)), shape=(m, n)) 19 | M1 = M > 0 20 | return M1.astype('int') 21 | 22 | 23 | # reads matrix from file and returns sparse matrix. 
first 2 columns should be row and column indices 24 | def readData(filename): 25 | edgesSource = [] 26 | edgesDest = [] 27 | with open(filename) as f: 28 | for line in f: 29 | toks = line.split() 30 | edgesSource.append(int(toks[0])) 31 | edgesDest.append(int(toks[1])) 32 | return listToSparseMatrix(edgesSource, edgesDest) 33 | 34 | 35 | def detectMultiple(M, detectFunc, numToDetect): 36 | Mcur = M.copy().tolil() 37 | res = [] 38 | for i in range(numToDetect): 39 | ((rowSet, colSet), score) = detectFunc(Mcur) 40 | res.append(((rowSet, colSet), score)) 41 | (rs, cs) = Mcur.nonzero() 42 | for i in range(len(rs)): 43 | if rs[i] in rowSet and cs[i] in colSet: 44 | Mcur[rs[i], cs[i]] = 0 45 | return res 46 | 47 | 48 | def detect_blocks(M, detectFunc): 49 | Mcur = M.copy().tolil() 50 | res = [] 51 | while True: 52 | ((rowSet, colSet), score) = detectFunc(Mcur) 53 | block = ((rowSet, colSet), score) 54 | if len(res) > 0: 55 | if abs(block[1] - res[-1][1]) < 0.01: 56 | break 57 | res.append(block) 58 | 59 | (rs, cs) = Mcur.nonzero() 60 | for i in range(len(rs)): 61 | if rs[i] in rowSet and cs[i] in colSet: 62 | Mcur[rs[i], cs[i]] = 0 63 | return res 64 | 65 | 66 | """ 67 | Inject a clique of size m0 by n0, with density pp. the last parameter testIdx determines the camouflage type. 68 | testIdx = 1: random camouflage, with camouflage density set so each fraudster outputs approximately 69 | equal number of fraudulent and camouflage edges 70 | testIdx = 2: random camouflage, with double the density as in the previous setting 71 | testIdx = 3: biased camouflage, more likely to add camouflage to high degree columns 72 | """ 73 | 74 | 75 | def injectCliqueCamo(M, m0, n0, p, testIdx): 76 | (m, n) = M.shape 77 | M2 = M.copy().tolil() 78 | 79 | colSum = np.squeeze(M2.sum(axis=0).A) 80 | colSumPart = colSum[n0:n] 81 | colSumPartPro = np.int_(colSumPart) 82 | colIdx = np.arange(n0, n, 1) 83 | population = np.repeat(colIdx, colSumPartPro, axis=0) 84 | 85 | for i in range(m0): 86 | # inject clique 87 | for j in range(n0): 88 | if random.random() < p: 89 | M2[i, j] = 1 90 | # inject camo 91 | if testIdx == 1: 92 | thres = p * n0 / (n - n0) 93 | for j in range(n0, n): 94 | if random.random() < thres: 95 | M2[i, j] = 1 96 | if testIdx == 2: 97 | thres = 2 * p * n0 / (n - n0) 98 | for j in range(n0, n): 99 | if random.random() < thres: 100 | M2[i, j] = 1 101 | # biased camo 102 | if testIdx == 3: 103 | colRplmt = random.sample(population, int(n0 * p)) 104 | M2[i, colRplmt] = 1 105 | 106 | return M2.tocsc() 107 | 108 | 109 | # sum of weighted edges in rowSet and colSet in matrix M 110 | def c2Score(M, rowSet, colSet): 111 | return M[list(rowSet), :][:, list(colSet)].sum(axis=None) 112 | 113 | 114 | def jaccard(pred, actual): 115 | intersectSize = len(set.intersection(pred[0], actual[0])) + len(set.intersection(pred[1], actual[1])) 116 | unionSize = len(set.union(pred[0], actual[0])) + len(set.union(pred[1], actual[1])) 117 | return intersectSize / unionSize 118 | 119 | 120 | def getPrecision(pred, actual): 121 | intersectSize = len(set.intersection(pred[0], actual[0])) + len(set.intersection(pred[1], actual[1])) 122 | return intersectSize / (len(pred[0]) + len(pred[1])) 123 | 124 | 125 | def getRecall(pred, actual): 126 | intersectSize = len(set.intersection(pred[0], actual[0])) + len(set.intersection(pred[1], actual[1])) 127 | return intersectSize / (len(actual[0]) + len(actual[1])) 128 | 129 | 130 | def getFMeasure(pred, actual): 131 | prec = getPrecision(pred, actual) 132 | rec = getRecall(pred, actual) 133 | 
return 0 if (prec + rec == 0) else (2 * prec * rec / (prec + rec)) 134 | 135 | 136 | def getRowPrecision(pred, actual, idx): 137 | intersectSize = len(set.intersection(pred[idx], actual[idx])) 138 | return intersectSize / len(pred[idx]) 139 | 140 | 141 | def getRowRecall(pred, actual, idx): 142 | intersectSize = len(set.intersection(pred[idx], actual[idx])) 143 | return intersectSize / len(actual[idx]) 144 | 145 | 146 | def getRowFMeasure(pred, actual, idx): 147 | prec = getRowPrecision(pred, actual, idx) 148 | rec = getRowRecall(pred, actual, idx) 149 | return 0 if (prec + rec == 0) else (2 * prec * rec / (prec + rec)) 150 | 151 | 152 | # run greedy algorithm using square root column weights 153 | def sqrtWeightedAveDegree(M): 154 | (m, n) = M.shape 155 | colSums = M.sum(axis=0) 156 | colWeights = 1.0 / np.sqrt(np.squeeze(colSums) + 5) 157 | colDiag = sparse.lil_matrix((n, n)) 158 | colDiag.setdiag(colWeights) 159 | W = M * colDiag 160 | return fastGreedyDecreasing(W, colWeights) 161 | 162 | 163 | # run greedy algorithm using logarithmic weights 164 | def logWeightedAveDegree(M): 165 | (m, n) = M.shape 166 | colSums = M.sum(axis=0) 167 | colWeights = np.squeeze(np.array(1.0 / np.log(np.squeeze(colSums) + 5))) 168 | colDiag = sparse.lil_matrix((n, n)) 169 | colDiag.setdiag(colWeights) 170 | W = M * colDiag 171 | # print('finished computing weight matrix') 172 | return fastGreedyDecreasing(W, colWeights) 173 | 174 | 175 | def aveDegree(M): 176 | (m, n) = M.shape 177 | return fastGreedyDecreasing(M, [1] * n) 178 | 179 | 180 | def subsetAboveDegree(M, col_thres, row_thres): 181 | M = M.tocsc() 182 | (m, n) = M.shape 183 | colSums = np.squeeze(np.array(M.sum(axis=0))) 184 | rowSums = np.squeeze(np.array(M.sum(axis=1))) 185 | colValid = colSums > col_thres 186 | rowValid = rowSums > row_thres 187 | M1 = M[:, colValid].tocsr() 188 | M2 = M1[rowValid, :] 189 | rowFilter = [i for i in range(m) if rowValid[i]] 190 | colFilter = [i for i in range(n) if colValid[i]] 191 | return M2, rowFilter, colFilter 192 | 193 | 194 | # @profile 195 | def fastGreedyDecreasing(M, colWeights): 196 | (m, n) = M.shape 197 | Md = M.todok() 198 | Ml = M.tolil() 199 | Mlt = M.transpose().tolil() 200 | rowSet = set(range(0, m)) 201 | colSet = set(range(0, n)) 202 | curScore = c2Score(M, rowSet, colSet) 203 | bestAveScore = curScore / (len(rowSet) + len(colSet)) 204 | bestSets = (rowSet, colSet) 205 | rowDeltas = np.squeeze(M.sum(axis=1).A) # *decrease* in total weight when *removing* this row 206 | colDeltas = np.squeeze(M.sum(axis=0).A) 207 | rowTree = MinTree(rowDeltas) 208 | colTree = MinTree(colDeltas) 209 | 210 | numDeleted = 0 211 | deleted = [] 212 | bestNumDeleted = 0 213 | 214 | while rowSet and colSet: 215 | nextRow, rowDelt = rowTree.getMin() 216 | nextCol, colDelt = colTree.getMin() 217 | if rowDelt <= colDelt: 218 | curScore -= rowDelt 219 | for j in Ml.rows[nextRow]: 220 | delt = colWeights[j] 221 | colTree.changeVal(j, -colWeights[j]) 222 | rowSet -= {nextRow} 223 | rowTree.changeVal(nextRow, float('inf')) 224 | deleted.append((0, nextRow)) 225 | else: 226 | curScore -= colDelt 227 | for i in Mlt.rows[nextCol]: 228 | delt = colWeights[nextCol] 229 | rowTree.changeVal(i, -colWeights[nextCol]) 230 | colSet -= {nextCol} 231 | colTree.changeVal(nextCol, float('inf')) 232 | deleted.append((1, nextCol)) 233 | 234 | numDeleted += 1 235 | curAveScore = curScore / (len(colSet) + len(rowSet)) 236 | 237 | if curAveScore > bestAveScore: 238 | bestAveScore = curAveScore 239 | bestNumDeleted = numDeleted 240 | 241 | # 
reconstruct the best row and column sets 242 | finalRowSet = set(range(m)) 243 | finalColSet = set(range(n)) 244 | for i in range(bestNumDeleted): 245 | if deleted[i][0] == 0: 246 | finalRowSet.remove(deleted[i][1]) 247 | else: 248 | finalColSet.remove(deleted[i][1]) 249 | return ((finalRowSet, finalColSet), bestAveScore) 250 | -------------------------------------------------------------------------------- /UGFraud/Detector/GANG.py: -------------------------------------------------------------------------------- 1 | """ 2 | 'GANG: Detecting Fraudulent Users in Online Social Networks via Guilt-by-Association on Directed Graphs' 3 | A guilt-by-association method on directed graphs, to detect fraudulent users in OSNs. 4 | Article: http://people.duke.edu/~zg70/papers/GANG.pdf 5 | """ 6 | 7 | from scipy.sparse import lil_matrix 8 | from UGFraud.Utils.helper import * 9 | import random 10 | 11 | 12 | def semi_data(ground_truth, portion): 13 | """ 14 | produce the sampled labeled review id used for semi-supervised prior 15 | :param ground_truth: dict of ground truth {uid:label} or {rid:label} 16 | :param portion: portion of the labeled data 17 | :return: review id which are used for supervising 18 | """ 19 | 20 | smaple_size = int(len(ground_truth) * portion * 0.5) 21 | total_list = [r for r in ground_truth.keys()] 22 | pos_list = [] 23 | neg_list = [] 24 | for id, label in ground_truth.items(): 25 | if label == 1: 26 | pos_list.append(id) 27 | else: 28 | neg_list.append(id) 29 | 30 | pos_sample = [pos_list[i] for i in sorted(random.sample(range(len(pos_list)), smaple_size))] 31 | neg_sample = [neg_list[i] for i in sorted(random.sample(range(len(neg_list)), smaple_size))] 32 | 33 | pos_ids = [total_list.index(s) for s in pos_sample] 34 | neg_ids = [total_list.index(s) for s in neg_sample] 35 | 36 | return pos_ids, neg_ids 37 | 38 | 39 | class GANG: 40 | 41 | def __init__(self, graph, user_ground_truth, sup_per, nor_flg, sup_flg=False): 42 | 43 | # number of dimensions of product-user matrix 44 | u_prior = node_attr_filter(graph, 'types', 'user', 'prior') 45 | p_prior = node_attr_filter(graph, 'types', 'prod', 'prior') 46 | r_prior = edge_attr_filter(graph, 'types', 'review', 'prior') 47 | priors = [u_prior, r_prior, p_prior] 48 | self.pu_dim = len(priors[0])+len(priors[2]) 49 | # spam belief prior vector 50 | self.res_pu_spam_prior_vector = None 51 | # diagonal matrix used for normalization 52 | self.diag_pu_matrix = None 53 | # product-user spam posterior belief vector 54 | self.res_pu_spam_post_vector = np.zeros((self.pu_dim, 1)) 55 | # sparse row matrix is faster when multiply with vectors 56 | self.pu_csr_matrix = None 57 | self.diag_pu_csr_matrix = None 58 | self.nor_pu_csr_matrix = None 59 | # priors dictionary 60 | self.u_priors = priors[0] 61 | self.r_priors = priors[1] 62 | self.p_priors = priors[2] 63 | # build prior belief vector 64 | p_vector, u_vector, r_vector = [], [], [] 65 | if nor_flg: 66 | # the mean value with normalization 67 | u_mean, p_mean, r_mean = 0.5, 0.5, 0.5 68 | else: 69 | # the mean value without normalization 70 | priors, mean_priors = nor_priors(priors) 71 | u_mean, r_mean, p_mean = mean_priors[0], mean_priors[1], mean_priors[2] 72 | 73 | for u in priors[0].values(): 74 | u_vector.append(u) 75 | for p in priors[2].values(): 76 | p_vector.append(p) 77 | 78 | res_u_vector = [i-u_mean for i in u_vector] 79 | res_p_vector = [i-p_mean for i in p_vector] 80 | 81 | # add semi-supervised user information 82 | if sup_flg: 83 | pos_ids, neg_ids = 
semi_data(user_ground_truth, sup_per) 84 | for iter, prob in enumerate(res_u_vector): 85 | if iter in pos_ids: 86 | res_u_vector[iter] = 1 - u_mean 87 | elif iter in neg_ids: 88 | res_u_vector[iter] = 0 - u_mean 89 | 90 | # aggregate the prior vectors 91 | res_pu_vector = res_p_vector + res_u_vector 92 | 93 | self.res_pu_spam_prior_vector = np.c_[res_pu_vector] 94 | 95 | # build product-user adjacency sparse matrix 96 | self.pu_matrix = lil_matrix((self.pu_dim, self.pu_dim)) 97 | 98 | # create the pu diagonal matrix 99 | self.diag_pu_matrix = lil_matrix((self.pu_dim, self.pu_dim)) 100 | for id in range(0, self.pu_dim): 101 | if id < len(self.p_priors): 102 | self.diag_pu_matrix[id, id] = len(graph[str(id)]) 103 | else: 104 | self.diag_pu_matrix[id, id] = len(graph[str(id)]) 105 | 106 | for p_id in p_prior.keys(): 107 | for neighbor_id in graph[p_id].keys(): 108 | self.pu_matrix[int(p_id), int(neighbor_id)] = 1 109 | 110 | for u_id in u_prior.keys(): 111 | for neighbor_id in graph[u_id].keys(): 112 | self.pu_matrix[int(u_id), int(neighbor_id)] = 1 113 | 114 | @timer 115 | def pu_lbp(self, max_iters): 116 | """ 117 | Run the matrix form of lbp on the product-user sparse matrix 118 | :return: the posterior belief vector of products and users 119 | """ 120 | 121 | # transfer to sparse row matrix to accelerate calculation 122 | self.pu_csr_matrix = self.pu_matrix.tocsr() 123 | self.diag_pu_csr_matrix = self.diag_pu_matrix.tocsr() 124 | 125 | i = 0 126 | while i < max_iters: 127 | sum_0 = np.sum(self.res_pu_spam_post_vector) 128 | self.res_pu_spam_post_vector = self.res_pu_spam_prior_vector + 2 * 0.008 * (self.pu_csr_matrix.dot(self.res_pu_spam_post_vector)) 129 | sum_1 = np.sum(self.res_pu_spam_post_vector) 130 | 131 | # print('iter: ' + str(i)) 132 | # print('diff: ' + str(abs(sum_0 - sum_1))) 133 | 134 | i += 1 135 | 136 | if abs(sum_0 - sum_1) < 0.1: 137 | return abs(sum_0 - sum_1) 138 | 139 | @timer 140 | def classify(self): 141 | """ 142 | Calculate the posterior belief of three type of nodes 143 | :return: u_post: users posterior beliefs, p_post: products posterior beliefs, 144 | r_post: reviews posterior beliefs. 145 | """ 146 | u_post = {} 147 | p_post = {} 148 | r_post = {} 149 | pu_post = self.res_pu_spam_post_vector 150 | no_prod = len(self.p_priors) 151 | # extract the posterior belief of users and reviews 152 | for i, r in enumerate(pu_post[no_prod:]): 153 | u_post[str(i + no_prod)] = float(r) 154 | for i, r in enumerate(pu_post[:no_prod]): 155 | p_post[str(i)] = float(r) 156 | for i, r in self.r_priors.items(): 157 | r_post[i] = (u_post[i[0]] + float(r)) / 2 158 | 159 | u_post = scale_value(u_post) 160 | p_post = scale_value(p_post) 161 | r_post = scale_value(r_post) 162 | 163 | return u_post, p_post, r_post -------------------------------------------------------------------------------- /UGFraud/Detector/MinTree.py: -------------------------------------------------------------------------------- 1 | """ 2 | A tree data structure which stores a list of degrees and can quickly retrieve the min degree element, 3 | or modify any of the degrees, each in logarithmic time. It works by creating a binary tree with the 4 | given elements in the leaves, where each internal node stores the min of its two children. 
5 | """ 6 | 7 | import math 8 | 9 | 10 | class MinTree: 11 | def __init__(self, degrees): 12 | self.input_length = len(degrees) 13 | self.height = int(math.ceil(math.log(len(degrees), 2))) 14 | self.numLeaves = 2 ** self.height 15 | self.numBranches = self.numLeaves - 1 16 | self.n = self.numBranches + self.numLeaves 17 | self.nodes = [float('inf')] * self.n 18 | for i in range(len(degrees)): 19 | self.nodes[self.numBranches + i] = degrees[i] 20 | for i in reversed(range(self.numBranches)): 21 | self.nodes[i] = min(self.nodes[2 * i + 1], self.nodes[2 * i + 2]) 22 | 23 | def getMin(self): 24 | cur = 0 25 | for i in range(self.height): 26 | cur = (2 * cur + 1) if self.nodes[2 * cur + 1] <= self.nodes[2 * cur + 2] else (2 * cur + 2) 27 | # print "found min at %d: %d" % (cur, self.nodes[cur]) 28 | return (cur - self.numBranches, self.nodes[cur]) 29 | 30 | def changeVal(self, idx, delta): 31 | cur = self.numBranches + idx 32 | self.nodes[cur] += delta 33 | for i in range(self.height): 34 | cur = (cur - 1) // 2 35 | nextParent = min(self.nodes[2 * cur + 1], self.nodes[2 * cur + 2]) 36 | if self.nodes[cur] == nextParent: 37 | break 38 | self.nodes[cur] = nextParent 39 | 40 | def dump(self): 41 | print ("numLeaves: %d, numBranches: %d, n: %d, nodes: " % (self.numLeaves, self.numBranches, self.n)) 42 | cur = 0 43 | for i in range(self.height + 1): 44 | for j in range(2 ** i): 45 | print (self.nodes[cur]) 46 | cur += 1 47 | print ('') 48 | 49 | def print_leaves(self): 50 | for i in range(self.input_length): 51 | print (self.nodes[self.numBranches + i]) 52 | -------------------------------------------------------------------------------- /UGFraud/Detector/SVD.py: -------------------------------------------------------------------------------- 1 | """ 2 | 'Singular Value Decomposition and Least Squares Solutions' 3 | The Singular-Value Decomposition, or SVD for short, is a matrix decomposition method for reducing 4 | a matrix to its constituent parts in order to make certain subsequent matrix calculations simpler. 
5 | Article: https://link.springer.com/content/pdf/10.1007/978-3-662-39778-7_10.pdf 6 | """ 7 | 8 | from UGFraud.Utils.helper import * 9 | from sklearn import svm 10 | from sklearn.svm import SVC 11 | from scipy.sparse.linalg import svds 12 | import numpy as np 13 | 14 | 15 | class SVD: 16 | def __init__(self, graph): 17 | """set up the data 18 | Args: 19 | graph: a networkx graph 20 | """ 21 | user_priors = node_attr_filter(graph, 'types', 'user', 'prior') 22 | prod_priors = node_attr_filter(graph, 'types', 'prod', 'prior') 23 | num_users = len(user_priors) 24 | num_products = len(prod_priors) 25 | self.user_prod_matrix = np.empty(shape=(num_users, num_products)) 26 | 27 | # create a dict for user_index in the user list and a dict for prod_index in the product list 28 | self.user_index = dict() 29 | self.prod_index = dict() 30 | 31 | i = 0 32 | for u_id in user_priors.keys(): 33 | self.user_index[u_id] = i 34 | i = i + 1 35 | 36 | j = 0 37 | for prod_id in prod_priors.keys(): 38 | self.prod_index[prod_id] = j 39 | j = j + 1 40 | # 41 | for user_id in user_priors.keys(): 42 | for p_id in graph[user_id].keys(): 43 | rating = graph.edges.get((user_id, p_id))['rating'] 44 | row = self.user_index[user_id] 45 | column = self.prod_index[p_id] 46 | self.user_prod_matrix[row, column] = rating 47 | 48 | @timer 49 | def run(self, percent): 50 | """ 51 | perform SVD and return the user-product matrix in a lower dimensional space 52 | """ 53 | k = int(max(np.round(min(self.user_prod_matrix.shape) * percent), 1)) 54 | u, s, v = svds(self.user_prod_matrix, k=k) 55 | return u 56 | 57 | def random_split(self, graph): 58 | """ 59 | Partition user nodes into training and test set randomly. 60 | Args: 61 | user_product_graph: a dictionary, with key = user_id, value = (p_id, rating, label, time) 62 | Return: 63 | training_user_id: a set of user id to appear in model training 64 | """ 65 | pos = set() 66 | node_degree = {} 67 | user_dict = node_attr_filter(graph, 'types', 'user', 'types') 68 | for u_id in user_dict.keys(): 69 | for p_id in graph[u_id].keys(): 70 | if graph.edges.get((u_id, p_id))['label'] == 0: 71 | pos.add(u_id) 72 | break 73 | node_degree[u_id] = len(graph[u_id]) 74 | 75 | neg = set(list(user_dict.keys())) - pos 76 | 77 | # random sample positive users 78 | training_pos = set(np.random.choice(list(pos), int(0.5 * len(pos))).ravel()) 79 | training_neg = set(np.random.choice(list(neg), int(0.5 * len(neg))).ravel()) 80 | 81 | test_pos = pos - training_pos 82 | test_neg = neg - training_neg 83 | 84 | print("number of positive %d" % len(pos)) 85 | print("number of negative %d" % len(neg)) 86 | print("number of all users %d" % len(user_dict)) 87 | 88 | return training_pos, training_neg, test_pos, test_neg 89 | 90 | def classify(self, training_data_svm, training_labels_svm, testing_data_svm, testing_labels_svm): 91 | clf = svm.SVC(probability=True) 92 | clf.fit(training_data_svm, training_labels_svm) 93 | SVC(C=100, tol=0.00001) 94 | predictions = clf.predict_proba(testing_data_svm) 95 | return predictions 96 | 97 | def classify_binary(self, training_data_svm, training_labels_svm, testing_data_svm, testing_labels_svm): 98 | clf = svm.SVC() 99 | clf.fit(training_data_svm, training_labels_svm) 100 | SVC(C=100, tol=0.00001) 101 | predictions = clf.predict(testing_data_svm) 102 | return predictions 103 | 104 | def evaluate_SVD(self, svd_output, graph): 105 | # random_split 106 | training_pos, training_neg, test_pos, test_neg = self.random_split(graph) 107 | training_labels = {i: +1 for i in 
training_pos} 108 | training_labels.update({i: -1 for i in training_neg}) 109 | 110 | test_labels = {i: +1 for i in test_pos} 111 | test_labels.update({i: -1 for i in test_neg}) 112 | 113 | training_data_svm = np.empty(shape=(len(training_labels), len(svd_output[1, :]))) 114 | training_labels_svm = np.empty(shape=(len(training_labels))) 115 | # build training data and labels for svm 116 | i = 0 117 | find_training_uid = dict() 118 | for k, v in training_labels.items(): 119 | u_index = self.user_index[k] 120 | training_data_svm[i, :] = svd_output[u_index, :] 121 | training_labels_svm[i] = v 122 | find_training_uid[i] = k 123 | i = i + 1 124 | # build testing data and labels for svm 125 | testing_data_svm = np.empty(shape=(len(test_labels), len(svd_output[1, :]))) 126 | testing_labels_svm = np.empty(shape=(len(test_labels))) 127 | j = 0 128 | find_testing_uid = np.empty(shape=(len(test_labels))) 129 | for k, v in test_labels.items(): 130 | u_index = self.user_index[k] 131 | testing_data_svm[j, :] = svd_output[u_index, :] 132 | testing_labels_svm[j] = v 133 | find_testing_uid[j] = k 134 | j = j + 1 135 | 136 | probas_pred = self.classify(training_data_svm, training_labels_svm, testing_data_svm, testing_labels_svm) 137 | result = [find_testing_uid, probas_pred[:, 0]] 138 | return result -------------------------------------------------------------------------------- /UGFraud/Detector/SpEagle.py: -------------------------------------------------------------------------------- 1 | """ 2 | 'Collective Opinion Spam Detection: Bridging Review Networks and Metadata' 3 | Utilizing clues from all metadata (text, timestamp, rating) as well as relational data (network), 4 | and harness them collectively under a unified framework to spot suspicious users and reviews, 5 | as well as products targeted by spam. 6 | Article: https://www.andrew.cmu.edu/user/lakoglu/pubs/15-kdd-collectiveopinionspam.pdf 7 | """ 8 | 9 | from UGFraud.Utils.helper import * 10 | from heapq import * 11 | from scipy.special import logsumexp 12 | import pickle 13 | 14 | 15 | class myTuple(): 16 | def __init__(self, cost, node_id): 17 | self._cost = cost 18 | self._id = node_id 19 | 20 | def __lt__(self, other): 21 | return self._cost < other._cost 22 | 23 | 24 | class Node(object): 25 | """ a Node object represents a node on the graph (which is also a random variable). 26 | 27 | Attributes: 28 | _name: node's ID. a string 29 | _type: a string denoting the type of the node (User, Review, Product) 30 | _prior: the node's prior distribution (\phi) 31 | _num_classes: number of classes, which is also the length of the prior vector. 32 | _outgoing: a dictionary of out-going messages to its neighbors (key: j, value: m_{i\to j}) 33 | where i is the current node and j is the target node. 34 | _neighbors: a list of references to its neighbors 35 | """ 36 | 37 | def __init__(self, name, prior, node_type): 38 | """ Create the attributes 39 | Args: 40 | name: a string id of this node. 
41 | prior: a floating number between [0,1] representing P(y=1 | node) 42 | node_type: 'u', 'p' or 'r' 43 | """ 44 | # to prevent log 0 45 | self._eps = 1e-5 46 | 47 | # node id (such as a u_id, p_id or review_id) 48 | self._name = name 49 | 50 | # list of names (such as u_id, p_id, and review_id) of the neighboring nodes 51 | self._neighbors = [] 52 | 53 | # a dictionary with key = neighboring node id, value = np.array() representing the message 54 | # from this node to the neighbor 55 | self._outgoing = {} 56 | 57 | # prior in log space, with check on 0's 58 | if prior == 1: 59 | prior = 1 - self._eps 60 | elif prior == 0: 61 | prior = self._eps 62 | 63 | self._prior = np.log( 64 | np.array([1 - prior, prior])) # previous version: self._prior = np.log(np.array([1-prior, prior])) 65 | 66 | self._num_classes = 2 67 | 68 | self._type = node_type 69 | 70 | # if self._type == 'p' and self._name == 'p0': 71 | # print ('product %s initialized.' % self._name) 72 | # print (self._outgoing) 73 | 74 | def add_neighbor(self, neighbor_node_id): 75 | """ 76 | add a neighboring node to this node; create out-going message from this node to the neighbor 77 | Args: 78 | neighbor_node_id: a string representing the neighbor's id 79 | """ 80 | self._neighbors.append(neighbor_node_id) 81 | self._outgoing[neighbor_node_id] = np.zeros(self._num_classes) 82 | 83 | def add_local_neighbor(self, neighbor_node_id, message): 84 | """ 85 | add a neighboring node to this node to build local graph; copy out-going message from the global graph 86 | Args: 87 | neighbor_node_id: a string representing the neighbor's id 88 | message: the message from this node to its neighbor 89 | """ 90 | # find the message to the corresponding neighbor_node_id 91 | 92 | for m in message: 93 | if neighbor_node_id in m.keys(): 94 | message_to_neighbor = m[neighbor_node_id] 95 | break 96 | 97 | self._neighbors.append(neighbor_node_id) 98 | self._outgoing[neighbor_node_id] = message_to_neighbor 99 | 100 | def init_outgoing(self): 101 | """ 102 | Initialize all messages to 0. 
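(Messages are stored in log space, so a zero vector corresponds to the uniform, multiplicative-identity message.)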
103 | """ 104 | # a dictionary: key = neighbor id, value = np.ndarray of uniform distributions 105 | #self._outgoing = {n: np.zeros(self._num_classes) for n in self._neighbors} 106 | for n in self._neighbors: 107 | self._outgoing[n].fill(0.0) 108 | 109 | # if self._type == 'p' and self._name == 'p0': 110 | # print (self._outgoing) 111 | 112 | def n_edges(self): 113 | return len(self._neighbors) 114 | 115 | def get_name(self): 116 | return self._name 117 | 118 | def get_type(self): 119 | return self._type 120 | 121 | def get_prior(self): 122 | """" return the prior of the node in prob space """ 123 | return np.exp(self._prior) 124 | 125 | def get_neighbors(self): 126 | return self._neighbors 127 | 128 | def get_outgoing(self): 129 | return self._outgoing 130 | 131 | def get_message_for(self, neighbor_name): 132 | """ find the message sent from this node to the neighbor specified by neighbor_name """ 133 | 134 | # note that _outgoing is a dictionary with key = neighbor id and value = messages 135 | # print(neighbor_name) 136 | assert neighbor_name in self._outgoing, "the neighbor %s is not a neighbor of the node %s\n" % ( 137 | neighbor_name, self._name) 138 | 139 | return self._outgoing[neighbor_name] 140 | 141 | def get_belief(self, all_nodes): 142 | """ return the belief of the node, along with the messages used to compute the belief 143 | Args: 144 | all_nodes: a dictionary containing all nodes on the graph 145 | Return: 146 | belief: 147 | incoming: 148 | """ 149 | 150 | incoming = [] 151 | 152 | # log 1 = 0 153 | belief = np.zeros(self._num_classes) 154 | 155 | # add log of phi 156 | belief += self._prior 157 | 158 | # go through each neighbor of the node 159 | for node_id in self._neighbors: 160 | # get the message sent from the neighbor n to the current node (self._name) 161 | 162 | # look up the neighboring node in all_nodes 163 | n = all_nodes[node_id] 164 | 165 | # getting message from the neighboring node to this node 166 | # consider working in the log scale to prevent underflowing 167 | 168 | # sum log m_ij 169 | belief += n.get_message_for(self._name) 170 | 171 | # in the same order as self._neighbors 172 | incoming.append(n.get_message_for(self._name)) 173 | # print (n.get_message_for(self._name)) 174 | 175 | return belief, incoming 176 | 177 | def recompute_outgoing(self, potentials, all_nodes, normalize=True): 178 | """ for each neighbor j, update the message sent to j 179 | 180 | Args: 181 | potentials: a dictionary (key = edge type, value = log of potential matrix). 182 | An edge type is src_type + "_" + dst_type 183 | all_nodes: same as that in get_belief 184 | 185 | Return: 186 | difference between previous and updated messages. 
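In log space, the update computed below for each neighbor j is
    log m_{i->j}(y_j) = logsumexp over y_i of [ log H(y_i, y_j) + log phi_i(y_i) + sum over k in N(i), k != j, of log m_{k->i}(y_i) ] - log Z,
where H is the edge potential for this edge type, phi_i is the node's prior, and log Z normalizes the message into a distribution over y_j.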
187 | """ 188 | # return value 189 | diff = 0 190 | 191 | # the messages in incoming is in the same order of self._neighbors 192 | # total = log phi_i + sum_{j~i} log m_ji 193 | # incoming = [log m_ji] 194 | total, incoming = self.get_belief(all_nodes) 195 | 196 | # go through each neighbor of the node 197 | for j, n_id in enumerate(self._neighbors): 198 | 199 | n = all_nodes[n_id] 200 | 201 | log_m_i = total - incoming[j] 202 | 203 | # note that the potential matrix depends on the edge type (write(user, review) or belong(review, product)) 204 | # edge_type can be (user-review), (review-product), (review-user) and (product-review) 205 | edge_type = self._type + '_' + n._type 206 | 207 | # log H, where H is symmetric and there is no need to transpose it 208 | log_H = potentials[edge_type] 209 | 210 | log_m_ij = logsumexp(log_H + np.tile(log_m_i.transpose(), (2, 1)), axis=1) 211 | 212 | # normalize the message 213 | log_Z = logsumexp(log_H + np.tile(log_m_i.transpose(), (2, 1))) 214 | 215 | log_m_ij -= log_Z# 216 | 217 | # accumulate the difference 218 | diff += np.sum(np.abs(self._outgoing[n._name] - log_m_ij)) 219 | 220 | # set the message from i to j 221 | self._outgoing[n._name] = log_m_ij 222 | return diff 223 | 224 | 225 | class SpEagle: 226 | def __init__(self, graph, potentials, message=None, max_iters=1): 227 | """ set up the data and parameters. 228 | 229 | Args: 230 | graph: a networkx graph 231 | 232 | potentials: a dictionary (key = edge_type, value=np.ndarray) 233 | """ 234 | 235 | self._potentials = potentials 236 | self._max_iters = max_iters 237 | self._message = message 238 | 239 | self._user_priors = node_attr_filter(graph, 'types', 'user', 'prior') 240 | self._product_priors = node_attr_filter(graph, 'types', 'prod', 'prior') 241 | self._review_priors = edge_attr_filter(graph, 'types', 'review', 'prior') 242 | 243 | # create nodes on the graph. 
key = u_id / p_id / review_id, value = node 244 | self._nodes = {} 245 | 246 | self._bp_schedule = [] 247 | 248 | # add nodes and edges to build the graph 249 | for u_id in self._user_priors.keys(): 250 | unique_u_id = 'u' + u_id 251 | 252 | # prior in log scale 253 | self._nodes[unique_u_id] = Node(unique_u_id, self._user_priors[u_id], 'u') 254 | 255 | # go through the reviews posted by the user 256 | for p_id in graph[u_id].keys(): 257 | unique_p_id = 'p' + p_id 258 | 259 | if unique_p_id not in self._nodes: 260 | self._nodes[unique_p_id] = Node(unique_p_id, self._product_priors[p_id], 'p') 261 | 262 | review_id = (u_id, p_id) 263 | unique_review_id = (unique_u_id, unique_p_id) 264 | 265 | if unique_review_id not in self._nodes: 266 | review_node = Node(unique_review_id, self._review_priors[review_id], 'r') 267 | 268 | # add connections and out-going messages if the graph is a global graph 269 | if self._message is None: 270 | review_node.add_neighbor(unique_u_id) 271 | review_node.add_neighbor(unique_p_id) 272 | self._nodes[unique_u_id].add_neighbor(unique_review_id) 273 | self._nodes[unique_p_id].add_neighbor(unique_review_id) 274 | else: 275 | # add connections and out-going messages if the graph is a local graph 276 | review_node.add_local_neighbor(unique_u_id, message[unique_review_id]) 277 | review_node.add_local_neighbor(unique_p_id, message[unique_review_id]) 278 | self._nodes[unique_u_id].add_local_neighbor(unique_review_id, message[unique_u_id]) 279 | self._nodes[unique_p_id].add_local_neighbor(unique_review_id, message[unique_p_id]) 280 | 281 | self._nodes[unique_review_id] = review_node 282 | 283 | def add_new_data(self, new_user_product_graph, new_priors): 284 | """ 285 | Add new a new users-review-products sub-graph to the global existing graph. 
286 | Need to be very careful as we don't want to mess up with existing structures and information 287 | 288 | :param new_user_product_graph: same format as the user_product_graph argument in __init__ 289 | :param new_priors: same format as the priors argument in __init__ 290 | :return: None 291 | """ 292 | new_u_priors = new_priors[0] 293 | new_p_priors = new_priors[2] 294 | new_r_priors = new_priors[1] 295 | 296 | for u_id, reviews in new_user_product_graph.items(): 297 | unique_u_id = 'u' + u_id 298 | if unique_u_id not in self._nodes: 299 | self._user_priors[u_id] = new_u_priors[u_id] 300 | self._nodes[unique_u_id] = Node(unique_u_id, self._user_priors[u_id], 'u') 301 | 302 | # go through the reviews posted by the user 303 | for t in reviews: 304 | p_id = t[0] 305 | unique_p_id = 'p' + p_id 306 | 307 | if unique_p_id not in self._nodes: 308 | self._product_priors[p_id] = new_p_priors[p_id] 309 | self._nodes[unique_p_id] = Node(unique_p_id, self._product_priors[p_id], 'p') 310 | 311 | review_id = (u_id, p_id) 312 | unique_review_id = (unique_u_id, unique_p_id) 313 | 314 | if unique_review_id not in self._nodes: 315 | self._review_priors[review_id] = new_r_priors[review_id] 316 | review_node = Node(unique_review_id, self._review_priors[review_id], 'r') 317 | 318 | # add connections and out-going messages 319 | review_node.add_neighbor(unique_u_id) 320 | review_node.add_neighbor(unique_p_id) 321 | self._nodes[unique_u_id].add_neighbor(unique_review_id) 322 | self._nodes[unique_p_id].add_neighbor(unique_review_id) 323 | self._nodes[unique_review_id] = review_node 324 | 325 | def safe_log(self, array, eps=1e-5): 326 | """ element-wise log the given array with smoothing worrying zeros 327 | """ 328 | return np.log((array + eps) / np.sum(array + eps)) 329 | 330 | def output_graph(self): 331 | """ 332 | output nodes, edges, priors and potentials 333 | """ 334 | for n in self._nodes.values(): 335 | print(str(n.get_name()) + ": " + n.get_type()) 336 | print(n.get_prior()) 337 | print(n.get_neighbors()) 338 | 339 | def schedule(self, schedule_type='bfs'): 340 | """ use breadth-first-search to create a BP schedule 341 | :param: 342 | schedule_type: 'bfs' or 'degree' 343 | :return: 344 | """ 345 | 346 | # sort nodes in descending order of their degrees 347 | items = [(n.get_name(), n.n_edges()) for k, n in self._nodes.items()] 348 | items = sorted(items, key=lambda x: x[1], reverse=True) 349 | 350 | if schedule_type == 'degree': 351 | self._bp_schedule = [name for name, _ in items] 352 | return 353 | 354 | mark = set(self._nodes.keys()) 355 | self._bp_schedule = [] 356 | 357 | head = 0 358 | tail = -1 359 | 360 | # uncomment this for loop to get bfs + degree 361 | for node_id, _ in items: 362 | # uncomment this for loop to get regular bfs 363 | # for node_id, _ in self._nodes.items(): 364 | node = self._nodes[node_id] 365 | # newly-found connected component 366 | if node_id in mark: 367 | tail += 1 368 | self._bp_schedule.append(node_id) 369 | mark.remove(node_id) 370 | 371 | # search starting from i 372 | while head <= tail: 373 | cur_node = self._nodes[self._bp_schedule[head]] 374 | head += 1 375 | for neighbor_id in cur_node._neighbors: 376 | if neighbor_id in mark: 377 | tail += 1 378 | self._bp_schedule.append(neighbor_id) 379 | mark.remove(neighbor_id) 380 | 381 | def local_schedule(self, starting_nodes, num_hops): 382 | """ 383 | Use Dijkstra to find nodes that are num_hops away from the starting nodes 384 | :param starting_nodes: the nodes considered the "source" 385 | :param num_hops: how 
far away to go 386 | :return: 387 | """ 388 | # node searched so far 389 | seen = set() 390 | # minimum distance of each seen node to the source nodes 391 | min_costs = {} 392 | # a priority queue of (cost, node_id) 393 | q = [] 394 | 395 | # initialize the queue to contain the starting nodes 396 | for node_id in starting_nodes: 397 | q.append(myTuple(0, node_id)) 398 | min_costs[node_id] = 0 399 | 400 | heapify(q) 401 | self._bp_schedule = [] 402 | while q: 403 | tuple = heappop(q) 404 | v = tuple._id 405 | cost = tuple._cost 406 | 407 | # if the node has cost no greater than num_hops, include it in the update schedule 408 | if cost <= num_hops: 409 | self._bp_schedule.append(v) 410 | 411 | if v not in seen: 412 | # now the node v has its shortest distance to the starting nodes. 413 | seen.add(v) 414 | cur_node = self._nodes[v] 415 | for n in cur_node._neighbors: 416 | if n not in seen: 417 | prev = min_costs.get(n, None) 418 | next = cost + 1 419 | if prev is None or next < prev: 420 | min_costs[n] = next 421 | heappush(q, myTuple(next, n)) 422 | return None 423 | 424 | @timer 425 | def run_bp(self, start_iter=0, max_iters=-1, early_stop_at=1, tol=1e-3): 426 | """ run belief propagation on the graph for MaxIters iterations 427 | Args: 428 | start_iter: continuing from the results of previous iterations 429 | max_iters: how many iterations to run BP. Default use the SpEagle's parameter 430 | early_stop_at: the percentage of nodes whose out-going messages will be updated 431 | tol: threshold of message differences of one iteration, below which exit BP 432 | Return: 433 | delta: the difference in messages before and after iterations of message passing 434 | """ 435 | stop_at = int(len(self._bp_schedule) * early_stop_at) 436 | 437 | if max_iters == -1: 438 | max_iters = self._max_iters 439 | 440 | for it in range(start_iter, start_iter + max_iters, 1): 441 | if it % 2 == 0: 442 | start = stop_at - 1 443 | end = -1 444 | step = -1 445 | else: 446 | start = 0 447 | end = stop_at 448 | step = 1 449 | p = start 450 | total_updates = 0 451 | delta = 0 452 | while p != end: 453 | total_updates += 1 454 | cur_node = self._nodes[self._bp_schedule[p]] 455 | p += step 456 | delta += cur_node.recompute_outgoing(self._potentials, self._nodes) 457 | if total_updates > stop_at: 458 | break 459 | delta /= total_updates 460 | # print('bp_iter = %d, delta = %f\n' % (it, delta)) 461 | if abs(delta) < tol: 462 | break 463 | return delta 464 | 465 | @timer 466 | def classify(self): 467 | """ read out the id of the maximal entry of each belief vector 468 | Return: 469 | userBelief: beliefs of the users 470 | reviewBelief: beliefs of the reviews 471 | prodBelief: beliefs of the products 472 | """ 473 | userBelief= {} 474 | reviewBelief= {} 475 | prodBelief= {} 476 | 477 | for k, n in self._nodes.items(): 478 | # decide the type of the node and find its original name 479 | node_type = None 480 | if isinstance(k, tuple): 481 | node_type = 'review' 482 | u_id = k[0][1:] 483 | p_id = k[1][1:] 484 | review_id = (u_id, p_id) 485 | else: 486 | if k[0] == 'u': 487 | node_type = 'user' 488 | user_id = k[1:] 489 | else: 490 | node_type = 'product' 491 | prod_id = k[1:] 492 | 493 | belief, _ = n.get_belief(self._nodes) 494 | 495 | # from log scale to prob scale and normalize to prob distribution 496 | posterior_med = np.exp(belief) 497 | posterior = posterior_med / np.sum(posterior_med) 498 | 499 | if node_type == 'review': 500 | reviewBelief[review_id] = posterior[1] 501 | elif node_type == 'user': 502 | 
userBelief[user_id] = posterior[1] 503 | elif node_type == 'product': 504 | prodBelief[prod_id] = posterior[1] 505 | else: 506 | continue 507 | 508 | return userBelief, reviewBelief, prodBelief 509 | 510 | 511 | if __name__ == '__main__': 512 | prefix = '/Users/dozee/Desktop/Reseach/Spam_Detection/Dataset/YelpChi/' 513 | metadata_filename = prefix + 'metadata.gz' 514 | 515 | # prior file names 516 | user_prior_filename = prefix + 'UserPriors.pickle' 517 | prod_prior_filename = prefix + 'ProdPriors.pickle' 518 | review_prior_filename = prefix + 'ReviewPriors.pickle' 519 | 520 | # read the graph and node priors 521 | user_product_graph, product_user_graph = read_graph_data(metadata_filename) 522 | 523 | with open(user_prior_filename, 'rb') as f: 524 | user_priors = pickle.load(f) 525 | 526 | with open(prod_prior_filename, 'rb') as f: 527 | prod_priors = pickle.load(f) 528 | 529 | with open(review_prior_filename, 'rb') as f: 530 | review_priors = pickle.load(f) 531 | 532 | # print(user_priors) 533 | # set up edge potentials 534 | ''' 535 | User and Review potential 536 | [1,0] 537 | [0,1] 538 | Reviewer and Review potential 539 | [1 - eps, eps] 540 | [eps, 1 - eps] 541 | ''' 542 | numerical_eps = 1e-5 543 | user_review_potential = np.log(np.array([[1 - numerical_eps, numerical_eps], [numerical_eps, 1 - numerical_eps]])) 544 | eps = 0.1 545 | review_product_potential = np.log(np.array([[1 - eps, eps], [eps, 1 - eps]])) 546 | 547 | potentials = {'u_r': user_review_potential, 'r_u': user_review_potential, 548 | 'r_p': review_product_potential, 'p_r': review_product_potential} 549 | 550 | model = SpEagle(user_product_graph, [user_priors, prod_priors, review_priors], potentials, max_iters=100) 551 | model.schedule() 552 | model.run_bp() 553 | 554 | -------------------------------------------------------------------------------- /UGFraud/Detector/ZooBP.py: -------------------------------------------------------------------------------- 1 | """ 2 | ZooBP: Belief Propagation for Heterogeneous Networks. 3 | A method to perform fast BP on undirected heterogeneous graphs with provable convergence guarantees. 4 | Article: http://www.vldb.org/pvldb/vol10/p625-eswaran.pdf 5 | """ 6 | 7 | from UGFraud.Utils.helper import timer 8 | from scipy.special import logsumexp 9 | from scipy import sparse 10 | from collections import defaultdict 11 | import numpy as np 12 | import networkx as nx 13 | 14 | 15 | def Initialize_Final_Beliefs(N1, N2, m): 16 | """ 17 | Initialization of final beliefs 18 | Args: 19 | N1: number of users 20 | N2: number of products 21 | m: coefficient for reduction in beliefs 22 | Returns: 23 | Concatenation of initialized final beliefs for users and products 24 | Example of return values: -0.5 0.5 -0.3 0.3 ... 
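For instance, with N1 = 2 users, N2 = 1 product and m = 0.001, each node receives one random value r in [-0.0005, 0.0005), the pair (r, -r) is stacked per node, and B is returned as a column vector of shape (2 * (N1 + N2), 1) = (6, 1), user entries first and product entries last.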
25 | """ 26 | r1 = m * (np.random.uniform(size=N1) - 0.5) 27 | r1 = r1.reshape(r1.shape[0], 1) 28 | r2 = m * (np.random.uniform(size=N2) - 0.5) 29 | r2 = r2.reshape(r2.shape[0], 1) 30 | B1 = np.concatenate((r1, -r1), axis=1) 31 | B2 = np.concatenate((r2, -r2), axis=1) 32 | 33 | temp1_B = B1.reshape((B1.shape[1] * B1.shape[0], 1)) 34 | temp2_B = B2.reshape((B2.shape[1] * B2.shape[0], 1)) 35 | B = np.concatenate((temp1_B, temp2_B), axis=0) 36 | 37 | return B 38 | 39 | 40 | class ZooBP: 41 | def __init__(self, graph, ep, H): 42 | """ 43 | Implementation of ZooBP in Python 44 | Args: 45 | graph: a networkx graph 46 | ep: interaction strength 47 | H: compatibility matrix 48 | Returns: 49 | final_user_beliefs: centered version of final user beliefs 50 | final_prod_beliefs: centered version of final prod beliefs 51 | NOTE: 52 | ZooBP requires consecutive integer ids (no gaps) 53 | """ 54 | a_list_temp = nx.get_edge_attributes(graph, 'rating') 55 | n, p = list(zip(*list(a_list_temp.keys()))) 56 | reversed_dict = defaultdict(list) 57 | node_types_index = nx.get_node_attributes(graph, 'types') 58 | for key, value in node_types_index.items(): 59 | reversed_dict[value].append(key) 60 | self.a_list = np.array(list(zip(n, p, a_list_temp.values())), dtype=np.int32) 61 | u_priors = dict() 62 | p_priors = dict() 63 | node_prior_index = nx.get_node_attributes(graph, 'prior') 64 | for i in reversed_dict['user']: 65 | u_priors[i] = node_prior_index[i] 66 | for i in reversed_dict['prod']: 67 | p_priors[i] = node_prior_index[i] 68 | self.u_tag, user_priors = zip(*u_priors.items()) 69 | self.u_priors = np.array(user_priors) 70 | self.p_tag, prod_priors = zip(*p_priors.items()) 71 | self.p_priors = np.array(prod_priors) 72 | self.ep = ep 73 | self.H = H 74 | 75 | @timer 76 | def run(self): 77 | # converts the given priors to the centered version 78 | user_priors = self.u_priors - 0.5 * np.ones((self.u_priors.shape[0])) 79 | prod_priors = self.p_priors - 0.5 * np.ones((self.p_priors.shape[0])) 80 | # finds positive (1) and negative (2) edges and reshapes them; the assignments below are restricted to the rating column so the user/product id columns are not overwritten 81 | rating = self.a_list[:, 2] 82 | self.a_list[self.a_list[:, 2] == 2, 2] = 2 83 | self.a_list[self.a_list[:, 2] == 1, 2] = 1 84 | edges_pos = self.a_list[rating == 1] 85 | edges_neg = self.a_list[rating == 2] 86 | Lpos = edges_pos[:, 0:2] 87 | Lpos = Lpos.reshape((edges_pos.shape[0], 2)) 88 | Lneg = edges_neg[:, 0:2] 89 | Lneg = Lneg.reshape((edges_neg.shape[0], 2)) 90 | n_user = user_priors.shape[0] 91 | n_prod = prod_priors.shape[0] 92 | 93 | # computes A+ and A- as defined in section 4.7 of ZooBP 94 | lpos_0 = Lpos[:, 0] - np.ones(Lpos[:, 0].shape[0]) 95 | lpos_1 = Lpos[:, 1] - np.ones(Lpos[:, 1].shape[0]) 96 | Apos = sparse.coo_matrix((np.ones(Lpos.shape[0]), (lpos_0, lpos_1)), shape=(n_user, n_prod)) 97 | lneg_0 = Lneg[:, 0] - np.ones(Lneg[:, 0].shape[0]) 98 | lneg_1 = Lneg[:, 1] - np.ones(Lneg[:, 1].shape[0]) 99 | Aneg = sparse.coo_matrix((np.ones(len(Lneg)), (lneg_0, lneg_1)), shape=(n_user, n_prod)) 100 | 101 | # prior beliefs are reshaped so that user1_belief 1-user1_belief ... 
prod1_belief 1-prod1_belief 102 | h_user_priors = np.reshape(user_priors, (len(user_priors), -1)) 103 | h_prod_priors = np.reshape(prod_priors, (len(prod_priors), -1)) 104 | user_priors = np.hstack((h_user_priors, -h_user_priors)) 105 | prod_priors = np.hstack((h_prod_priors, -h_prod_priors)) 106 | reshape_u = user_priors.reshape((2 * n_user, 1)) 107 | reshape_p = prod_priors.reshape((2 * n_prod, 1)) 108 | E = np.concatenate((reshape_u, reshape_p)) 109 | 110 | # build P defined under section 4.7 of ZooBP 111 | R = sparse.kron(Apos - Aneg, self.ep * self.H) 112 | sp1 = sparse.coo_matrix((2 * n_user, 2 * n_user), dtype=np.int8) 113 | temp1 = sparse.hstack([sp1, 0.5 * R]) 114 | sp2 = sparse.coo_matrix((2 * n_prod, 2 * n_prod), dtype=np.int8) 115 | temp2 = sparse.hstack([0.5 * R.transpose(), sp2]) 116 | P = sparse.vstack((temp1, temp2)) 117 | P = P.transpose() 118 | 119 | # build Q defined under section 4.7 of ZooBP 120 | sum_temp = Apos + Aneg 121 | temp1 = sum_temp.sum(axis=1) 122 | temp2 = sum_temp.sum(axis=0) 123 | D12 = sparse.diags(np.asarray(temp1.flatten()).reshape(-1)) 124 | D21 = sparse.diags(np.asarray(temp2.flatten()).reshape(-1)) 125 | temp = 0.25 * self.ep * self.ep * sparse.kron(D12, self.H) 126 | Q_1 = sparse.eye(n_user * 2) + temp 127 | Q_2 = sparse.eye(n_prod * 2) + (0.25 * self.ep * self.ep) * (sparse.kron(D21, self.H)) 128 | sp1 = sparse.coo_matrix((n_user * 2, n_prod * 2), dtype=np.int8) 129 | Q_temp1 = sparse.hstack((Q_1, sp1)) 130 | sp2 = sparse.coo_matrix((n_prod * 2, n_user * 2), dtype=np.int8) 131 | Q_temp2 = sparse.hstack((sp2, Q_2)) 132 | Q = sparse.vstack((Q_temp1, Q_temp2)) 133 | 134 | # M 135 | M = P - Q + sparse.eye(2 * (n_user + n_prod)) 136 | M = M.transpose() 137 | B = Initialize_Final_Beliefs(n_user, n_prod, 0.001) 138 | 139 | # Iterative Solution 140 | res = 1 141 | while (res > 1e-8): 142 | Bold = B 143 | # Equations (13) and (14) in ZooBP 144 | B = E + logsumexp(M * Bold) 145 | res = np.sum(np.sum(abs(Bold - B))) 146 | 147 | B1 = B[0:2 * n_user, :] 148 | B2 = B[2 * n_user:, :] 149 | user_beliefs = B1.reshape((n_user, 2)) 150 | user_beliefs = dict(zip(self.u_tag, user_beliefs[:, 0])) 151 | prod_beliefs = B2.reshape((n_prod, 2)) 152 | prod_beliefs = dict(zip(self.p_tag, prod_beliefs[:, 0])) 153 | 154 | return user_beliefs, prod_beliefs 155 | 156 | 157 | 158 | 159 | 160 | -------------------------------------------------------------------------------- /UGFraud/Detector/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- -------------------------------------------------------------------------------- /UGFraud/Detector/fBox.py: -------------------------------------------------------------------------------- 1 | """ 2 | 'Spotting Suspicious Link Behavior with fBox: An Adversarial Perspective.' 3 | An algorithm designed to catch small-scale, stealth attacks that slip below the radar. 
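In brief, fBox computes a rank-k truncated SVD of the binary user-product adjacency matrix and flags users and products whose reconstructed degree falls below the tau-th percentile among nodes with the same observed degree.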
4 | Article: https://arxiv.org/pdf/1410.3915.pdf 5 | """ 6 | 7 | from UGFraud.Utils.helper import timer 8 | from numpy.linalg import * 9 | from scipy.sparse import csr_matrix 10 | from scipy.sparse.linalg import svds 11 | import numpy as np 12 | 13 | 14 | class fBox(): 15 | def __init__(self, graph): 16 | """ 17 | fBox only takes a binary user-product graph 18 | graph: a networkx graph 19 | """ 20 | self.u_id2idx = {} 21 | self.idx2u_id = {} 22 | self.p_id2idx = {} 23 | self.idx2p_id = {} 24 | 25 | # construct a sparse matrix from the graph 26 | row_idx = [] 27 | col_idx = [] 28 | data = [] 29 | 30 | user_idx = 0 31 | product_idx = 0 32 | for k in graph.edges(): 33 | if k[0] not in self.u_id2idx: 34 | self.u_id2idx[k[0]] = user_idx 35 | self.idx2u_id[user_idx] = k[0] 36 | user_idx += 1 37 | 38 | if k[1] not in self.p_id2idx: 39 | self.p_id2idx[k[1]] = product_idx 40 | self.idx2p_id[product_idx] = k[1] 41 | product_idx += 1 42 | 43 | row_idx.append(self.u_id2idx[k[0]]) 44 | col_idx.append(self.p_id2idx[k[1]]) 45 | data.append(1) 46 | 47 | self.num_users = user_idx 48 | self.num_products = product_idx 49 | self.matrix = csr_matrix((data, (row_idx, col_idx)), shape=(user_idx, product_idx)).asfptype() 50 | 51 | @timer 52 | def run(self, tau, k): 53 | """ 54 | run the algorithm. 55 | tau: the percentile in reconstructed degree threshold under which a node is considered suspicious 56 | """ 57 | # k = 50 is selected based on Figure 3 of the paper 58 | u, s, vt = svds(self.matrix, k=k) 59 | # reconstructed out degree 60 | self.recOutDeg = norm(u.dot(np.diag(s)), axis=1) 61 | # reconstructed in degree 62 | self.recInDeg = norm(vt.T.dot(np.diag(s)), axis=1) 63 | 64 | # detect users 65 | out_deg = self.matrix.sum(axis=1) 66 | self.out_deg = np.array(out_deg).reshape(-1, ) 67 | self.unique_out_deg = np.unique(self.out_deg) 68 | 69 | # store the indices of suspicious users 70 | suspicious_users = {} 71 | thresholds = {} 72 | for d in self.unique_out_deg: 73 | # find users with original degree = d 74 | users = (self.out_deg == d) 75 | user_deg = self.recOutDeg[users] 76 | thresholds[d] = np.percentile(user_deg, tau) 77 | 78 | for i in range(self.num_users): 79 | user_d = self.out_deg[i] 80 | if self.recOutDeg[i] < thresholds[user_d]: 81 | 82 | if user_d not in suspicious_users: 83 | suspicious_users[user_d] = [] 84 | suspicious_users[user_d].append(self.idx2u_id[i]) 85 | 86 | # detect products 87 | in_deg = self.matrix.sum(axis=0) 88 | self.in_deg = np.array(in_deg).reshape(-1, ) 89 | self.unique_in_deg = np.unique(self.in_deg) 90 | 91 | # store the indices of suspicious users 92 | suspicious_products = {} 93 | thresholds = {} 94 | 95 | for d in self.unique_in_deg: 96 | prods = (self.in_deg == d) 97 | prod_deg = self.recInDeg[prods] 98 | thresholds[d] = np.percentile(prod_deg, tau) 99 | 100 | for i in range(self.num_products): 101 | prod_d = self.in_deg[i] 102 | if self.recInDeg[i] < thresholds[prod_d]: 103 | if prod_d not in suspicious_products: 104 | suspicious_products[prod_d] = [] 105 | suspicious_products[prod_d].append(self.idx2p_id[i]) 106 | 107 | return suspicious_users, suspicious_products 108 | 109 | def get_srms(self): 110 | """ 111 | return two matrices one for use the other for products 112 | each matrix has rows as reconstruction degree and column as old degree in the graph. 
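(Concretely: osrm[i, d] counts how many users with observed out-degree d fall into the i-th histogram bin of reconstructed out-degree; isrm is the analogous matrix for products and in-degree.)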
113 | """ 114 | 115 | hist, edges = np.histogram(self.recOutDeg, bins=100) 116 | data = [] 117 | rows = [] 118 | cols = [] 119 | 120 | for d in self.unique_out_deg: 121 | user_deg = self.recOutDeg[self.out_deg == d] 122 | bin_indices = np.digitize(user_deg, edges) 123 | for i in bin_indices: 124 | data.append(1) 125 | rows.append(i) 126 | cols.append(d) 127 | 128 | self.osrm = csr_matrix((data, (rows, cols)), shape=(len(edges) + 1, max(self.unique_out_deg) + 1)) 129 | 130 | hist, edges = np.histogram(self.recInDeg, bins=10) 131 | data = [] 132 | rows = [] 133 | cols = [] 134 | for d in self.unique_in_deg: 135 | prod_deg = self.recInDeg[self.in_deg == d] 136 | bin_indices = np.digitize(prod_deg, edges) 137 | for i in bin_indices: 138 | data.append(1) 139 | rows.append(i) 140 | cols.append(d) 141 | self.isrm = csr_matrix((data, (rows, cols))) 142 | 143 | return self.osrm, self.isrm 144 | -------------------------------------------------------------------------------- /UGFraud/Utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- -------------------------------------------------------------------------------- /UGFraud/Utils/helper.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics import average_precision_score 2 | from sklearn.metrics import roc_auc_score 3 | import gzip 4 | import numpy as np 5 | import networkx as nx 6 | import time 7 | import functools 8 | import warnings 9 | 10 | 11 | def create_ground_truth(user_data): 12 | """Given user data, return a dictionary of labels of users and reviews 13 | Args: 14 | user_data: key = user_id, value = list of review tuples. 15 | Return: 16 | user_ground_truth: key = user id (not prefixed), value = 0 (non-spam) /1 (spam) 17 | review_ground_truth: review id (not prefixed), value = 0 (non-spam) /1 (spam) 18 | """ 19 | user_ground_truth = {} 20 | review_ground_truth = {} 21 | 22 | for user_id, reviews in user_data.items(): 23 | 24 | user_ground_truth[user_id] = 0 25 | 26 | for r in reviews: 27 | prod_id = r[0] 28 | label = r[2] 29 | 30 | if label == -1: 31 | review_ground_truth[(user_id, prod_id)] = 1 32 | user_ground_truth[user_id] = 1 33 | else: 34 | review_ground_truth[(user_id, prod_id)] = 0 35 | 36 | return user_ground_truth, review_ground_truth 37 | 38 | 39 | def evaluate(y, pred_y): 40 | """ 41 | Revise: test when a key is a review/account. 42 | Evaluate the prediction of account and review by SpEagle 43 | Args: 44 | y: dictionary with key = user_id/review_id and value = ground truth (1 means spam, 0 means non-spam) 45 | pred_y: dictionary with key = user_id/review_id and value = p(y=spam | x) produced by SpEagle. 
46 | the keys in pred_y must be a subset of the keys in y 47 | """ 48 | posteriors = [] 49 | ground_truth = [] 50 | 51 | for k, v in pred_y.items(): 52 | if k in y: 53 | posteriors.append(v) 54 | ground_truth.append(y[k]) 55 | 56 | if len(np.unique(ground_truth)) < 2: 57 | warnings.warn("Only one class present in ground_truth, ROC AUC score will be omitted") 58 | ap = average_precision_score(ground_truth, posteriors) 59 | return None, ap 60 | else: 61 | auc = roc_auc_score(ground_truth, posteriors) 62 | ap = average_precision_score(ground_truth, posteriors) 63 | return auc, ap 64 | 65 | 66 | def scale_value(value_dict): 67 | """ 68 | Calculate and return a dict of the value of input dict scaled to (0, 1) 69 | """ 70 | 71 | ranked_dict = [(user, value_dict[user]) for user in value_dict.keys()] 72 | ranked_dict = sorted(ranked_dict, reverse=True, key=lambda x: x[1]) 73 | 74 | up_max, up_mean, up_min = ranked_dict[0][1], ranked_dict[int(len(ranked_dict) / 2)][1], ranked_dict[-1][1] 75 | 76 | scale_dict = {} 77 | for i, p in value_dict.items(): 78 | norm_value = (p - up_min) / (up_max - up_min) 79 | if norm_value == 0: # avoid the 0 80 | scale_dict[i] = 0 + 1e-7 81 | elif norm_value == 1: # avoid the 1 82 | scale_dict[i] = 1 - 1e-7 83 | else: 84 | scale_dict[i] = norm_value 85 | 86 | return scale_dict 87 | 88 | 89 | def nor_priors(priors): 90 | """ 91 | Normalize the node priors for GANG 92 | :param priors: 93 | :return: 94 | """ 95 | new_upriors, new_rpriors, new_ppriors = priors 96 | 97 | # normalize the node priors to (0,1) 98 | # if we normalize the prior, we need to set nor_flg to True for the gang model 99 | ranked_upriors = [(user, new_upriors[user]) for user in new_upriors.keys()] 100 | ranked_upriors = sorted(ranked_upriors, reverse=True, key=lambda x: x[1]) 101 | ranked_rpriors = [(user, new_rpriors[user]) for user in new_rpriors.keys()] 102 | ranked_rpriors = sorted(ranked_rpriors, reverse=True, key=lambda x: x[1]) 103 | ranked_ppriors = [(user, new_ppriors[user]) for user in new_ppriors.keys()] 104 | ranked_ppriors = sorted(ranked_ppriors, reverse=True, key=lambda x: x[1]) 105 | u_max, u_mean, u_min = ranked_upriors[0][1], ranked_upriors[int(len(ranked_upriors) / 2)][1], ranked_upriors[-1][1] 106 | p_max, p_mean, p_min = ranked_ppriors[0][1], ranked_ppriors[int(len(ranked_ppriors) / 2)][1], ranked_ppriors[-1][1] 107 | r_max, r_mean, r_min = ranked_rpriors[0][1], ranked_rpriors[int(len(ranked_rpriors) / 2)][1], ranked_rpriors[-1][1] 108 | for i, p in priors[0].items(): 109 | priors[0][i] = (p - u_min) / (u_max - u_min) 110 | for i, p in priors[1].items(): 111 | priors[1][i] = (p - r_min) / (r_max - r_min) 112 | for i, p in priors[2].items(): 113 | priors[2][i] = (p - p_min) / (p_max - p_min) 114 | 115 | return priors, [u_mean, r_mean, p_mean] 116 | 117 | 118 | def get_hash(data): 119 | import hashlib 120 | return hashlib.md5(data).hexdigest() 121 | 122 | 123 | def read_graph_data(metadata_filename, adj=False): 124 | """ Read the user-review-product graph from file. Can output the graph in different formats 125 | Args: 126 | metadata_filename: a gzipped file containing the graph. 
127 | adj: if True: create adjacent data, default is False 128 | Return: 129 | graph: user-review / prod-review / list of adjacent(adj=True) 130 | """ 131 | 132 | user_data = {} 133 | 134 | prod_data = {} 135 | 136 | adj_data = [] 137 | 138 | # use the rt mode to read ascii strings instead of binary 139 | if adj is False: 140 | with gzip.open(metadata_filename, 'rt') as f: 141 | # file format: each line is a tuple (user id, product id, rating, label, date) 142 | for line in f: 143 | items = line.strip().split() 144 | u_id = items[0] 145 | p_id = items[1] 146 | if items[2] != 'None': 147 | rating = float(items[2]) 148 | else: 149 | rating = 'None' 150 | label = int(items[3]) 151 | date = items[4] 152 | 153 | if u_id not in user_data: 154 | user_data[u_id] = [] 155 | user_data[u_id].append((p_id, rating, label, date)) 156 | 157 | if p_id not in prod_data: 158 | prod_data[p_id] = [] 159 | prod_data[p_id].append((u_id, rating, label, date)) 160 | 161 | # create adj_list [u_id, p_id, 1/2], where 1 indicates positive rating (4, 5) 162 | # and 2 indicates negative rating (1, 2, 3) 163 | 164 | print('read reviews from %s' % metadata_filename) 165 | print('number of users = %d' % len(user_data)) 166 | print('number of products = %d' % len(prod_data)) 167 | return user_data, prod_data 168 | else: 169 | # create adj_list [u_id, p_id, 1/2], where 1 indicates positive rating (4, 5) 170 | # and 2 indicates negative rating (1, 2, 3) 171 | with gzip.open(metadata_filename, 'rt') as f: 172 | # file format: each line is a tuple (user id, product id, rating, label, date) 173 | for line in f: 174 | items = line.strip().split() 175 | u_id = items[0] 176 | p_id = items[1] 177 | if items[2] != 'None': 178 | rating = float(items[2]) 179 | else: 180 | rating = 'None' 181 | label = int(items[3]) 182 | date = items[4] 183 | 184 | if u_id not in user_data: 185 | user_data[u_id] = [] 186 | user_data[u_id].append((p_id, rating, label, date)) 187 | 188 | if p_id not in prod_data: 189 | prod_data[p_id] = [] 190 | prod_data[p_id].append((u_id, rating, label, date)) 191 | 192 | if int(rating) <= 3: 193 | rating = int(2) 194 | else: 195 | rating = int(1) 196 | adj_data.append([u_id, p_id, rating]) 197 | 198 | print('read reviews from %s' % metadata_filename) 199 | print('number of users = %d' % len(user_data)) 200 | print('number of products = %d' % len(prod_data)) 201 | print('number of ratings = %d' % len(adj_data)) 202 | return user_data, prod_data, np.array(adj_data, dtype='int32') 203 | 204 | 205 | def depth(data): 206 | """ 207 | Get the depth of a dictionary 208 | Args: 209 | data: data in dictionary type 210 | 211 | Returns: the depth of a dictionary 212 | 213 | """ 214 | if isinstance(data, dict): 215 | return 1 + (max(map(depth, data.values())) if data else 0) 216 | return 0 217 | 218 | 219 | def data_checker(data): 220 | """ 221 | data validation 222 | Args: 223 | data: data in dictionary type 224 | 225 | Returns: pass the validation 226 | 227 | """ 228 | if isinstance(data, dict): 229 | if depth(data) < 3: 230 | raise Exception("The minimum depth of data must be 3. 
For example: {\'node1\':{\'node1_neighbor\':{" 231 | "neighbor's attribute}}}") 232 | else: 233 | raise AttributeError("Data must be stored in dictionary.") 234 | 235 | 236 | def dict_to_networkx(data): 237 | """ 238 | Convert data into networkx graph 239 | Args: 240 | data: data in dictionary type 241 | 242 | Returns: networkx graph 243 | 244 | """ 245 | data_checker(data) 246 | G = nx.Graph(data) 247 | return G 248 | 249 | 250 | def add_attribute_to_graph(graph, attribute, adding_type): 251 | """ 252 | Add new attributes to nodes/edges 253 | Args: 254 | graph: networkx graph 255 | attribute: dictionary of attributes for nodes/edges 256 | adding_type: string of node or edge 257 | 258 | Returns: 259 | networkx graph with new attributes 260 | """ 261 | if isinstance(attribute, dict): 262 | if isinstance(graph, nx.classes.graph.Graph): 263 | if adding_type == 'node': 264 | nx.set_node_attributes(graph, attribute) 265 | return graph 266 | elif adding_type == 'edge': 267 | nx.set_edge_attributes(graph, attribute) 268 | return graph 269 | else: 270 | raise Exception("Adding type must be \'node\' or \'edge\'.") 271 | else: 272 | raise Exception("The graph must be a networkx graph.") 273 | else: 274 | raise AttributeError("Attribute must be stored in dictionary.") 275 | 276 | 277 | def get_node_attributes_index(graph, attr): 278 | """ 279 | get node index for each attributes 280 | Args: 281 | graph: networkx graph 282 | attr: nodes' attribute 283 | 284 | Returns: 285 | a dict of list which contains every attribute index 286 | For example: {'user': ['201','202','203','204'], 'prod': ['0', '1', '2']} 287 | """ 288 | from collections import defaultdict 289 | node_temp = nx.get_node_attributes(graph, attr) 290 | reversed_dict = defaultdict(list) 291 | for key, value in node_temp.items(): 292 | reversed_dict[value].append(key) 293 | return reversed_dict 294 | 295 | 296 | def get_edge_attributes_index(graph, attr): 297 | """ 298 | get edge index for each attributes 299 | Args: 300 | graph: networkx graph 301 | attr: edges' attribute 302 | 303 | Returns: 304 | a dict of list which contains every attribute index 305 | For example: {'review': [('201', '0'), ('202', '0'), ('203', '0'), ('204', '0')]} 306 | """ 307 | from collections import defaultdict 308 | node_temp = nx.get_edge_attributes(graph, attr) 309 | reversed_dict = defaultdict(list) 310 | for key, value in node_temp.items(): 311 | reversed_dict[value].append(key) 312 | return reversed_dict 313 | 314 | 315 | def node_attr_filter(graph, attr, specific_attr, into_attr): 316 | """ 317 | get specific keys, values in conditions 318 | Args: 319 | graph: networkx graph 320 | attr: which attribute index you want to get 321 | specific_attr: which specific attribute index you want to get depending on attr 322 | into_attr: use specific attribute index to filter the attribute 323 | 324 | Returns: 325 | dict(node: into_attr values) 326 | For example: node_attr_filter(graph, 'types', 'user', 'prior) 327 | will return the dict( user_id: user_id_prior) 328 | 329 | """ 330 | attr_dict_index = get_node_attributes_index(graph, attr) 331 | specific_dict = attr_dict_index[specific_attr] 332 | filtered_dict = dict() 333 | into_dict = nx.get_node_attributes(graph, into_attr) 334 | for i in specific_dict: 335 | filtered_dict[i] = into_dict[i] 336 | return filtered_dict 337 | 338 | 339 | def edge_attr_filter(graph, attr, specific_attr, into_attr): 340 | """ 341 | get specific keys, values in conditions 342 | Args: 343 | graph: networkx graph 344 | attr: which attribute 
index you want to get 345 | specific_attr: which specific attribute index you want to get depending on attr 346 | into_attr: use specific attribute index to filter the attribute 347 | 348 | Returns: 349 | dict(edge: into_attr values) 350 | For example: edge_attr_filter(graph, 'types', 'review', 'prior) 351 | will return the dict(review_id: review_id_prior) 352 | 353 | """ 354 | attr_dict_index = get_edge_attributes_index(graph, attr) 355 | specific_dict = attr_dict_index[specific_attr] 356 | filtered_dict = dict() 357 | into_dict = nx.get_edge_attributes(graph, into_attr) 358 | for i in specific_dict: 359 | filtered_dict[i] = into_dict[i] 360 | return filtered_dict 361 | 362 | 363 | def save_graph(graph, graph_name=False): 364 | """ 365 | 366 | Args: 367 | graph: network graph 368 | graph_name: the file name of the graph, if graph_name=False, use default name 369 | 370 | Returns: 371 | None 372 | """ 373 | from networkx.readwrite import json_graph 374 | import json 375 | data = json_graph.node_link_data(graph) 376 | if graph_name is False: 377 | graph_name = 'graph_data.json' 378 | with open(graph_name, 'w') as f: 379 | json.dump(data, f) 380 | f.close() 381 | print('Saved graph data as {}'.format(graph_name)) 382 | 383 | 384 | def load_graph(json_name): 385 | """ 386 | 387 | Args: 388 | json_name: json file name 389 | 390 | Returns: 391 | networkx graph 392 | """ 393 | from networkx.readwrite import json_graph 394 | import json 395 | with open(json_name, 'r') as f: 396 | data = json.load(f) 397 | f.close() 398 | graph = json_graph.node_link_graph(data) 399 | print('Loaded {} into the nextorkx graph'.format(json_name)) 400 | return graph 401 | 402 | 403 | def timer(func): 404 | """Print the runtime of the decorated function""" 405 | @functools.wraps(func) 406 | def wrapper_timer(*args, **kwargs): 407 | start_time = time.perf_counter() 408 | value = func(*args, **kwargs) 409 | end_time = time.perf_counter() 410 | run_time = end_time - start_time 411 | print("Finished {} in {} secs".format(func.__name__, round(run_time, 3))) 412 | return value 413 | return wrapper_timer 414 | 415 | -------------------------------------------------------------------------------- /UGFraud/Yelp_Data/YelpChi/metadata.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/safe-graph/UGFraud/b47ac521d6a0fcc1d8880619275c9d48ccfa2997/UGFraud/Yelp_Data/YelpChi/metadata.gz -------------------------------------------------------------------------------- /UGFraud/Yelp_Data/YelpChi/priors.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/safe-graph/UGFraud/b47ac521d6a0fcc1d8880619275c9d48ccfa2997/UGFraud/Yelp_Data/YelpChi/priors.pkl -------------------------------------------------------------------------------- /UGFraud/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- -------------------------------------------------------------------------------- /UGFraud_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/safe-graph/UGFraud/b47ac521d6a0fcc1d8880619275c9d48ccfa2997/UGFraud_logo.png -------------------------------------------------------------------------------- /reference/fbox.txt: -------------------------------------------------------------------------------- 1 | @inproceedings{shah2014spotting, 2 | title={Spotting suspicious link behavior 
with fbox: An adversarial perspective}, 3 | author={Shah, Neil and Beutel, Alex and Gallagher, Brian and Faloutsos, Christos}, 4 | booktitle={2014 IEEE International Conference on Data Mining}, 5 | pages={959--964}, 6 | year={2014}, 7 | organization={IEEE} 8 | } -------------------------------------------------------------------------------- /reference/fraudar.txt: -------------------------------------------------------------------------------- 1 | @inproceedings{hooi2016fraudar, 2 | title={Fraudar: Bounding graph fraud in the face of camouflage}, 3 | author={Hooi, Bryan and Song, Hyun Ah and Beutel, Alex and Shah, Neil and Shin, Kijung and Faloutsos, Christos}, 4 | booktitle={Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining}, 5 | pages={895--904}, 6 | year={2016} 7 | } -------------------------------------------------------------------------------- /reference/gang.txt: -------------------------------------------------------------------------------- 1 | @inproceedings{wang2017gang, 2 | title={GANG: Detecting fraudulent users in online social networks via guilt-by-association on directed graphs}, 3 | author={Wang, Binghui and Gong, Neil Zhenqiang and Fu, Hao}, 4 | booktitle={2017 IEEE International Conference on Data Mining (ICDM)}, 5 | pages={465--474}, 6 | year={2017}, 7 | organization={IEEE} 8 | } -------------------------------------------------------------------------------- /reference/speagle.txt: -------------------------------------------------------------------------------- 1 | @inproceedings{rayana2015collective, 2 | title={Collective opinion spam detection: Bridging review networks and metadata}, 3 | author={Rayana, Shebuti and Akoglu, Leman}, 4 | booktitle={Proceedings of the 21th acm sigkdd international conference on knowledge discovery and data mining}, 5 | pages={985--994}, 6 | year={2015} 7 | } -------------------------------------------------------------------------------- /reference/svd.txt: -------------------------------------------------------------------------------- 1 | @incollection{golub1971singular, 2 | title={Singular value decomposition and least squares solutions}, 3 | author={Golub, Gene H and Reinsch, Christian}, 4 | booktitle={Linear Algebra}, 5 | pages={134--151}, 6 | year={1971}, 7 | publisher={Springer} 8 | } -------------------------------------------------------------------------------- /reference/zoobp.txt: -------------------------------------------------------------------------------- 1 | @article{eswaran2017zoobp, 2 | title={Zoobp: Belief propagation for heterogeneous networks}, 3 | author={Eswaran, Dhivya and G{\"u}nnemann, Stephan and Faloutsos, Christos and Makhija, Disha and Kumar, Mohit}, 4 | journal={Proceedings of the VLDB Endowment}, 5 | volume={10}, 6 | number={5}, 7 | pages={625--636}, 8 | year={2017}, 9 | publisher={VLDB Endowment} 10 | } -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | networkx>=2.2 2 | numpy>=1.16.6 3 | scipy>=1.2.3 4 | scikit-learn>=0.20.4 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | from os import path 3 | from io import open # for Python 2 and 3 compatibility 4 | 5 | this_directory = path.abspath(path.dirname(__file__)) 6 | 7 | # read the contents of requirements.txt 8 | with 
open(path.join(this_directory, 'requirements.txt'), 9 | encoding='utf-8') as f: 10 | requirements = f.read().splitlines() 11 | 12 | with open("README.md", "r") as fh: 13 | long_description = fh.read() 14 | 15 | setuptools.setup( 16 | name="UGFraud", # Replace with your own username 17 | version="0.1.1.2", 18 | author="Yingtong Dou, Chen Wang, Sihong Xie, Guixiang Ma, and UIC BDSC Lab", 19 | author_email="bdscsafegraph@gmail.com", 20 | description="An Unsupervised Graph-based Toolbox for Fraud Detection", 21 | long_description=long_description, 22 | include_package_data=True, 23 | long_description_content_type="text/markdown", 24 | url="https://github.com/safe-graph/UGFraud", 25 | download_url='https://github.com/safe-graph/UGFraud/archive/master.zip', 26 | keywords=['fraud detection', 'anomaly detection', 'graph algorithm', 27 | 'data mining', 'security'], 28 | package_data={ 29 | # If any package contains *.txt or *.rst files, include them: 30 | "UGFraud": ["Yelp_Data/YelpChi/*.gz", "Yelp_Data/YelpChi/*.pkl"]}, 31 | packages=setuptools.find_packages(), 32 | classifiers=[ 33 | 'Programming Language :: Python :: 3.6', 34 | 'Programming Language :: Python :: 3.7', 35 | 'Development Status :: 5 - Production/Stable', 36 | 'Intended Audience :: Education', 37 | 'Intended Audience :: Financial and Insurance Industry', 38 | 'Intended Audience :: Science/Research', 39 | 'Intended Audience :: Developers', 40 | 'Intended Audience :: Information Technology', 41 | 'License :: OSI Approved :: Apache Software License', 42 | 'Operating System :: OS Independent', 43 | ], 44 | python_requires='>=3.6', 45 | ) 46 | -------------------------------------------------------------------------------- /tests/testing.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 4 | from UGFraud.Demo.eval_fBox import * 5 | from UGFraud.Demo.eval_Fraudar import * 6 | from UGFraud.Demo.eval_GANG import * 7 | from UGFraud.Demo.eval_SpEagle import * 8 | from UGFraud.Demo.eval_SVD import * 9 | from UGFraud.Demo.eval_ZooBP import * 10 | from UGFraud.Demo.demo_pre import * 11 | 12 | 13 | sys.path.insert(0, os.path.abspath('../UGFraud/Demo/')) 14 | # data source 15 | file_name = 'Yelp_graph_data.json' 16 | # path_name = sys.path[0] + '/' + file_name 17 | try: 18 | G = load_graph(file_name) 19 | except FileNotFoundError: 20 | data_path = 'UGFraud/Yelp_Data/' 21 | data_to_network_graph(data_path) 22 | G = load_graph(file_name) 23 | user_ground_truth = node_attr_filter(G, 'types', 'user', 'label') 24 | review_ground_truth = edge_attr_filter(G, 'types', 'review', 'label') 25 | 26 | """ 27 | testing fBox 28 | """ 29 | print("*" * 80) 30 | print("Testing fBox") 31 | t = 20 # taus = [0.5, 1, 5, 10, 25, 50, 99] 32 | k = 50 # k = range(10, 51, 10) 33 | serBelief, reviewBelief = runfBox(G, t, k) 34 | reviewBelief = scale_value(reviewBelief) 35 | 36 | review_AUC, review_AP = evaluate(review_ground_truth, reviewBelief) 37 | print('review AUC = {}'.format(review_AUC)) 38 | print('review AP = {}'.format(review_AP)) 39 | 40 | """ 41 | testing Fraudar 42 | """ 43 | print("*" * 80) 44 | print("Testing Fraudar") 45 | userBelief, reviewBelief = runFraudar(G, multiple=0) 46 | reviewBelief = scale_value(reviewBelief) 47 | 48 | review_AUC, review_AP = evaluate(review_ground_truth, reviewBelief) 49 | print('review AUC = {}'.format(review_AUC)) 50 | print('review AP = {}'.format(review_AP)) 51 | 52 | """ 53 | 
testing GANG 54 | """ 55 | print("*" * 80) 56 | print("Testing GANG") 57 | # add semi-supervised user information / threshold 58 | sup_per = 0.1 59 | 60 | # run GANG model 61 | model = GANG(G, user_ground_truth, sup_per, nor_flg=True, sup_flg=False) 62 | 63 | # run Linearized Belief Propagation on product-user matrix with 1000 iterations 64 | iteration = 1000 65 | model.pu_lbp(iteration) 66 | userBelief, _, reviewBelief = model.classify() 67 | reviewBelief = scale_value(reviewBelief) 68 | 69 | # evaluation 70 | review_AUC, review_AP = evaluate(review_ground_truth, reviewBelief) 71 | print('review AUC = {}'.format(review_AUC)) 72 | print('review AP = {}'.format(review_AP)) 73 | 74 | """ 75 | testing Prior 76 | """ 77 | print("*" * 80) 78 | print("Testing Prior") 79 | # normalize the review prior as the review suspicious belief 80 | rpriors = edge_attr_filter(G, 'types', 'review', 'prior') 81 | reviewBelief = scale_value(rpriors) 82 | 83 | review_AUC, review_AP = evaluate(review_ground_truth, reviewBelief) 84 | print('review AUC = {}'.format(review_AUC)) 85 | print('review AP = {}'.format(review_AP)) 86 | 87 | """ 88 | testing SpEagle 89 | """ 90 | print("*" * 80) 91 | print("Testing SpEagle") 92 | # input parameters: numerical_eps, eps, num_iters, stop_threshold 93 | numerical_eps = 1e-5 94 | eps = 0.1 95 | user_review_potential = np.log(np.array([[1 - numerical_eps, numerical_eps], [numerical_eps, 1 - numerical_eps]])) 96 | review_product_potential = np.log(np.array([[1 - eps, eps], [eps, 1 - eps]])) 97 | potentials = {'u_r': user_review_potential, 'r_u': user_review_potential, 98 | 'r_p': review_product_potential, 'p_r': review_product_potential} 99 | max_iters = 4 100 | stop_threshold = 1e-3 101 | 102 | model = SpEagle(G, potentials, message=None, max_iters=max_iters) 103 | 104 | # build the BFS message-passing schedule, then run belief propagation 105 | model.schedule(schedule_type='bfs') 106 | 107 | start_iter = 0 108 | num_bp_iters = 2 109 | model.run_bp(start_iter=start_iter, max_iters=num_bp_iters, tol=stop_threshold) 110 | 111 | userBelief, reviewBelief, _ = model.classify() 112 | 113 | review_AUC, review_AP = evaluate(review_ground_truth, reviewBelief) 114 | print('review AUC = {}'.format(review_AUC)) 115 | print('review AP = {}'.format(review_AP)) 116 | 117 | """ 118 | testing SVD 119 | """ 120 | print("*" * 80) 121 | print("Testing SVD") 122 | percent = 0.9 123 | model = SVD(G) 124 | svd_output = model.run(percent) 125 | result = model.evaluate_SVD(svd_output, G) 126 | index = list(map(str, map(int, result[0]))) 127 | userBelief = dict(zip(index, result[1])) 128 | user_AUC, user_AP = evaluate(user_ground_truth, userBelief) 129 | print('user AUC = {}'.format(user_AUC)) 130 | print('user AP = {}'.format(user_AP)) 131 | 132 | """ 133 | testing ZooBP 134 | """ 135 | print("*" * 80) 136 | print("Testing ZooBP") 137 | ep = 0.01 138 | # H: compatibility matrix 139 | H = np.array([[0.5, -0.5], [-0.5, 0.5]]) 140 | 141 | model = ZooBP(G, ep, H) 142 | userBelief, _ = model.run() # result = (user_beliefs, prod_beliefs) 143 | 144 | user_AUC, user_AP = evaluate(user_ground_truth, userBelief) 145 | print('user AUC = {}'.format(user_AUC)) 146 | print('user AP = {}'.format(user_AP)) 147 | --------------------------------------------------------------------------------