├── .gitignore
├── ArmTemplates
│   ├── configure_arm.json
│   ├── dsvm_arm.json
│   ├── fraud-detection_arm.json
│   └── fraud-detection_hdi_arm.json
├── Data
│   ├── Account_Info.csv
│   ├── Fraud_Transactions.csv
│   ├── Untagged_Transactions.csv
│   └── readme.md
├── LICENSE
├── R
│   ├── Fraud.rproj
│   ├── Fraud.rxproj
│   ├── Fraud.sln
│   ├── Fraud_Detection_Notebook.ipynb
│   ├── modeling_main.R
│   ├── readme.md
│   ├── step1_tagging.R
│   ├── step2_splitting_preprocessing.R
│   ├── step3_feature_engineering.R
│   └── step4_training_evaluation.R
├── README.md
├── RSparkCluster
│   ├── copy_dev_to_prod.R
│   ├── development_main.R
│   ├── in_memory_scoring.R
│   ├── production_main.R
│   ├── readme.md
│   ├── step0_directories_creation.R
│   ├── step1_merge_account_info.R
│   ├── step2_tagging.R
│   ├── step3_splitting.R
│   ├── step4_preprocessing.R
│   ├── step5_create_risk_tables.R
│   ├── step6_feature_engineering.R
│   ├── step7_training.R
│   ├── step8_prediction.R
│   ├── step9_evaluation.R
│   └── web_scoring_main.R
├── Resources
│   ├── ActionScripts
│   │   ├── ConfigureSQL.ps1
│   │   ├── CreateDatabase.sql
│   │   ├── CreateSQLObjectsR.sql
│   │   ├── FraudSetup.ps1
│   │   ├── SetupVM.ps1
│   │   ├── createShortcuts.ps1
│   │   ├── frauddetection_Help.url
│   │   └── hdisetup.sh
│   ├── exampleuser.sql
│   ├── images
│   │   └── fraud.jpg
│   └── readme.md
├── SECURITY.md
├── SQLR
│   ├── CreateRiskTable.sql
│   ├── OnlineFraudDetection.ps1
│   ├── ParseString.sql
│   ├── ScoreOneTrans.sql
│   ├── SortAcctTable.sql
│   ├── Step0_CreateTables.sql
│   ├── Step10A_Evaluation.sql
│   ├── Step10B_Evaluation_AUC.sql
│   ├── Step1_MergeAcctInfo.sql
│   ├── Step2_Tagging.sql
│   ├── Step3_SplitData.sql
│   ├── Step4_Preprocess.sql
│   ├── Step5_Save2History.sql
│   ├── Step6_CreateRiskTables.sql
│   ├── Step7_FeatureEngineer.sql
│   ├── Step8_Training.sql
│   ├── Step9_Prediction.sql
│   ├── UtilityFunctions.sql
│   ├── createuser.sql
│   └── readme.md
├── Website
│   ├── package.json
│   ├── public
│   │   ├── css
│   │   │   ├── bootstrap.min.css
│   │   │   ├── bootswatch.css
│   │   │   └── myCSS.css
│   │   ├── fonts
│   │   │   ├── glyphicons-halflings-regular.eot
│   │   │   ├── glyphicons-halflings-regular.svg
│   │   │   ├── glyphicons-halflings-regular.ttf
│   │   │   └── glyphicons-halflings-regular.woff
│   │   ├── img
│   │   │   ├── bracelet.jpg
│   │   │   ├── earrings.jpg
│   │   │   ├── heart.jpg
│   │   │   ├── logo.gif
│   │   │   ├── progress.gif
│   │   │   └── ring.jpg
│   │   └── js
│   │       ├── boostrap.min.js
│   │       ├── customize.js
│   │       ├── jquery-3.2.1.min.js
│   │       ├── jquery-3.2.1.min.map
│   │       ├── scoreClaim.js
│   │       └── startUp.js
│   ├── readme.md
│   ├── server.js
│   ├── startMe.vbs
│   └── views
│       ├── home.handlebars
│       └── layouts
│           └── main.handlebars
├── onlinefraud.pbix
└── onlinefraudHDI.pbix

/.gitignore:
--------------------------------------------------------------------------------
1 | ## Ignore Visual Studio temporary files, build results, and
2 | ## files generated by popular Visual Studio add-ons.
3 | ## 4 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 5 | 6 | # User-specific files 7 | *.suo 8 | *.user 9 | *.userosscache 10 | *.sln.docstates 11 | 12 | # User-specific files (MonoDevelop/Xamarin Studio) 13 | *.userprefs 14 | 15 | # Build results 16 | [Dd]ebug/ 17 | [Dd]ebugPublic/ 18 | [Rr]elease/ 19 | [Rr]eleases/ 20 | x64/ 21 | x86/ 22 | bld/ 23 | [Bb]in/ 24 | [Oo]bj/ 25 | [Ll]og/ 26 | 27 | # Visual Studio 2015 cache/options directory 28 | .vs/ 29 | # Uncomment if you have tasks that create the project's static files in wwwroot 30 | #wwwroot/ 31 | 32 | # MSTest test Results 33 | [Tt]est[Rr]esult*/ 34 | [Bb]uild[Ll]og.* 35 | 36 | # NUNIT 37 | *.VisualState.xml 38 | TestResult.xml 39 | 40 | # Build Results of an ATL Project 41 | [Dd]ebugPS/ 42 | [Rr]eleasePS/ 43 | dlldata.c 44 | 45 | # .NET Core 46 | project.lock.json 47 | project.fragment.lock.json 48 | artifacts/ 49 | **/Properties/launchSettings.json 50 | 51 | *_i.c 52 | *_p.c 53 | *_i.h 54 | *.ilk 55 | *.meta 56 | *.obj 57 | *.pch 58 | *.pdb 59 | *.pgc 60 | *.pgd 61 | *.rsp 62 | *.sbr 63 | *.tlb 64 | *.tli 65 | *.tlh 66 | *.tmp 67 | *.tmp_proj 68 | *.log 69 | *.vspscc 70 | *.vssscc 71 | .builds 72 | *.pidb 73 | *.svclog 74 | *.scc 75 | 76 | # Chutzpah Test files 77 | _Chutzpah* 78 | 79 | # Visual C++ cache files 80 | ipch/ 81 | *.aps 82 | *.ncb 83 | *.opendb 84 | *.opensdf 85 | *.sdf 86 | *.cachefile 87 | *.VC.db 88 | *.VC.VC.opendb 89 | 90 | # Visual Studio profiler 91 | *.psess 92 | *.vsp 93 | *.vspx 94 | *.sap 95 | 96 | # TFS 2012 Local Workspace 97 | $tf/ 98 | 99 | # Guidance Automation Toolkit 100 | *.gpState 101 | 102 | # ReSharper is a .NET coding add-in 103 | _ReSharper*/ 104 | *.[Rr]e[Ss]harper 105 | *.DotSettings.user 106 | 107 | # JustCode is a .NET coding add-in 108 | .JustCode 109 | 110 | # TeamCity is a build add-in 111 | _TeamCity* 112 | 113 | # DotCover is a Code Coverage Tool 114 | *.dotCover 115 | 116 | # Visual Studio code coverage results 117 | *.coverage 118 | *.coveragexml 119 | 120 | # NCrunch 121 | _NCrunch_* 122 | .*crunch*.local.xml 123 | nCrunchTemp_* 124 | 125 | # MightyMoose 126 | *.mm.* 127 | AutoTest.Net/ 128 | 129 | # Web workbench (sass) 130 | .sass-cache/ 131 | 132 | # Installshield output folder 133 | [Ee]xpress/ 134 | 135 | # DocProject is a documentation generator add-in 136 | DocProject/buildhelp/ 137 | DocProject/Help/*.HxT 138 | DocProject/Help/*.HxC 139 | DocProject/Help/*.hhc 140 | DocProject/Help/*.hhk 141 | DocProject/Help/*.hhp 142 | DocProject/Help/Html2 143 | DocProject/Help/html 144 | 145 | # Click-Once directory 146 | publish/ 147 | 148 | # Publish Web Output 149 | *.[Pp]ublish.xml 150 | *.azurePubxml 151 | # TODO: Comment the next line if you want to checkin your web deploy settings 152 | # but database connection strings (with potential passwords) will be unencrypted 153 | *.pubxml 154 | *.publishproj 155 | 156 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 157 | # checkin your Azure Web App publish settings, but sensitive information contained 158 | # in these scripts will be unencrypted 159 | PublishScripts/ 160 | 161 | # NuGet Packages 162 | *.nupkg 163 | # The packages folder can be ignored because of Package Restore 164 | **/packages/* 165 | # except build/, which is used as an MSBuild target. 
166 | !**/packages/build/ 167 | # Uncomment if necessary however generally it will be regenerated when needed 168 | #!**/packages/repositories.config 169 | # NuGet v3's project.json files produces more ignorable files 170 | *.nuget.props 171 | *.nuget.targets 172 | 173 | # Microsoft Azure Build Output 174 | csx/ 175 | *.build.csdef 176 | 177 | # Microsoft Azure Emulator 178 | ecf/ 179 | rcf/ 180 | 181 | # Windows Store app package directories and files 182 | AppPackages/ 183 | BundleArtifacts/ 184 | Package.StoreAssociation.xml 185 | _pkginfo.txt 186 | 187 | # Visual Studio cache files 188 | # files ending in .cache can be ignored 189 | *.[Cc]ache 190 | # but keep track of directories ending in .cache 191 | !*.[Cc]ache/ 192 | 193 | # Others 194 | ClientBin/ 195 | ~$* 196 | *~ 197 | *.dbmdl 198 | *.dbproj.schemaview 199 | *.jfm 200 | *.pfx 201 | *.publishsettings 202 | orleans.codegen.cs 203 | 204 | # Since there are multiple workflows, uncomment next line to ignore bower_components 205 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 206 | #bower_components/ 207 | 208 | # RIA/Silverlight projects 209 | Generated_Code/ 210 | 211 | # Backup & report files from converting an old project file 212 | # to a newer Visual Studio version. Backup files are not needed, 213 | # because we have git ;-) 214 | _UpgradeReport_Files/ 215 | Backup*/ 216 | UpgradeLog*.XML 217 | UpgradeLog*.htm 218 | 219 | # SQL Server files 220 | *.mdf 221 | *.ldf 222 | *.ndf 223 | 224 | # Business Intelligence projects 225 | *.rdl.data 226 | *.bim.layout 227 | *.bim_*.settings 228 | 229 | # Microsoft Fakes 230 | FakesAssemblies/ 231 | 232 | # GhostDoc plugin setting file 233 | *.GhostDoc.xml 234 | 235 | # Node.js Tools for Visual Studio 236 | .ntvs_analysis.dat 237 | node_modules/ 238 | 239 | # Typescript v1 declaration files 240 | typings/ 241 | 242 | # Visual Studio 6 build log 243 | *.plg 244 | 245 | # Visual Studio 6 workspace options file 246 | *.opt 247 | 248 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 249 | *.vbw 250 | 251 | # Visual Studio LightSwitch build output 252 | **/*.HTMLClient/GeneratedArtifacts 253 | **/*.DesktopClient/GeneratedArtifacts 254 | **/*.DesktopClient/ModelManifest.xml 255 | **/*.Server/GeneratedArtifacts 256 | **/*.Server/ModelManifest.xml 257 | _Pvt_Extensions 258 | 259 | # Paket dependency manager 260 | .paket/paket.exe 261 | paket-files/ 262 | 263 | # FAKE - F# Make 264 | .fake/ 265 | 266 | # JetBrains Rider 267 | .idea/ 268 | *.sln.iml 269 | 270 | # CodeRush 271 | .cr/ 272 | 273 | # Python Tools for Visual Studio (PTVS) 274 | __pycache__/ 275 | *.pyc 276 | 277 | # Cake - Uncomment if you are using it 278 | # tools/** 279 | # !tools/packages.config 280 | 281 | # Telerik's JustMock configuration file 282 | *.jmconfig 283 | 284 | # BizTalk build output 285 | *.btp.cs 286 | *.btm.cs 287 | *.odx.cs 288 | *.xsd.cs 289 | -------------------------------------------------------------------------------- /ArmTemplates/configure_arm.json: -------------------------------------------------------------------------------- 1 | { 2 | "contentVersion": "1.0.0.0", 3 | "$schema": "http://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", 4 | "parameters": { 5 | "adminUsername": { 6 | "type": "string", 7 | "metadata": { 8 | "description": "Username for the Virtual Machine." 9 | } 10 | }, 11 | "adminPassword": { 12 | "type": "securestring", 13 | "metadata": { 14 | "description": "Password for the Virtual Machine." 
15 | } 16 | }, 17 | "vmName": { 18 | "type": "string", 19 | "metadata": { 20 | "description": "Name for the Virtual Machine." 21 | } 22 | } 23 | }, 24 | "variables": { 25 | "location": "[resourceGroup().location]" 26 | }, 27 | "resources": [ 28 | { 29 | "type": "Microsoft.Compute/virtualMachines/extensions", 30 | "name": "[concat(parameters('vmName'),'/FraudDetectionSetup')]", 31 | "apiVersion": "2015-05-01-preview", 32 | "location": "[variables('location')]", 33 | "properties": { 34 | "publisher": "Microsoft.Compute", 35 | "type": "CustomScriptExtension", 36 | "typeHandlerVersion": "1.7", 37 | "autoUpgradeMinorVersion": false, 38 | "settings": { 39 | "fileUris": [ 40 | "https://raw.githubusercontent.com/Microsoft/r-server-fraud-detection/master/Resources/ActionScripts/FraudSetup.ps1" 41 | ], 42 | "commandToExecute": "[concat('powershell.exe -ExecutionPolicy Unrestricted -File FraudSetup.ps1 -serverName ', parameters('vmName') ,' -username ',parameters('adminUsername'),' -password ',parameters('adminPassword'))]" 43 | } 44 | } 45 | } 46 | ], 47 | "outputs": { 48 | } 49 | } -------------------------------------------------------------------------------- /ArmTemplates/dsvm_arm.json: -------------------------------------------------------------------------------- 1 | { 2 | "contentVersion": "1.0.0.0", 3 | "$schema": "http://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", 4 | "parameters": { 5 | "adminUsername": { 6 | "type": "string", 7 | "metadata": { 8 | "description": "Username for the Virtual Machine" 9 | } 10 | }, 11 | "adminPassword": { 12 | "type": "securestring", 13 | "metadata": { 14 | "description": "Password for the Virtual Machine" 15 | } 16 | }, 17 | "vmName": { 18 | "type": "string", 19 | "metadata": { 20 | "description": "Name for the Virtual Machine" 21 | } 22 | }, 23 | "vmSize": { 24 | "type": "string", 25 | "metadata": { 26 | "description": "Size for the Virtual Machine" 27 | } 28 | } 29 | }, 30 | "variables": { 31 | "apiVersion": "2015-10-01", 32 | "location": "[resourceGroup().location]", 33 | "imagePublisher": "microsoft-ads", 34 | "imageOffer": "windows-data-science-vm", 35 | "sku": "windows2016", 36 | "version": "03.25.19", 37 | "OSDiskName": "osdiskforwindowssimple", 38 | "nicName": "[parameters('vmName')]", 39 | "addressPrefix": "10.0.0.0/16", 40 | "subnetName": "Subnet", 41 | "subnetPrefix": "10.0.0.0/24", 42 | "storageAccountType": "Standard_LRS", 43 | "storageAccountName": "[concat(uniquestring(resourceGroup().id), 'windsvm')]", 44 | "publicIPAddressType": "Dynamic", 45 | "publicIPAddressName": "[tolower(concat('co',parameters('vmName'),uniquestring(resourceGroup().id)))]", 46 | "vmStorageAccountContainerName": "vhds", 47 | "vmName": "[parameters('vmName')]", 48 | "vmSize": "[parameters('vmSize')]", 49 | "virtualNetworkName": "[parameters('vmName')]", 50 | "vnetID": "[resourceId('Microsoft.Network/virtualNetworks',variables('virtualNetworkName'))]", 51 | "subnetRef": "[concat(variables('vnetID'),'/subnets/',variables('subnetName'))]" 52 | }, 53 | "resources": [ 54 | { 55 | "type": "Microsoft.Storage/storageAccounts", 56 | "name": "[variables('storageAccountName')]", 57 | "apiVersion": "2015-05-01-preview", 58 | "location": "[variables('location')]", 59 | "properties": { 60 | "accountType": "[variables('storageAccountType')]" 61 | } 62 | }, 63 | { 64 | "apiVersion": "2015-05-01-preview", 65 | "type": "Microsoft.Network/publicIPAddresses", 66 | "name": "[variables('publicIPAddressName')]", 67 | "location": "[variables('location')]", 68 | 
"properties": { 69 | "publicIPAllocationMethod": "[variables('publicIPAddressType')]", 70 | "dnsSettings": { 71 | "domainNameLabel": "[variables('publicIPAddressName')]" 72 | } 73 | } 74 | }, 75 | { 76 | "apiVersion": "2015-05-01-preview", 77 | "type": "Microsoft.Network/virtualNetworks", 78 | "name": "[variables('virtualNetworkName')]", 79 | "location": "[variables('location')]", 80 | "properties": { 81 | "addressSpace": { 82 | "addressPrefixes": [ 83 | "[variables('addressPrefix')]" 84 | ] 85 | }, 86 | "subnets": [ 87 | { 88 | "name": "[variables('subnetName')]", 89 | "properties": { 90 | "addressPrefix": "[variables('subnetPrefix')]" 91 | } 92 | } 93 | ] 94 | } 95 | }, 96 | { 97 | "apiVersion": "2015-05-01-preview", 98 | "type": "Microsoft.Network/networkInterfaces", 99 | "name": "[variables('nicName')]", 100 | "location": "[variables('location')]", 101 | "dependsOn": [ 102 | "[concat('Microsoft.Network/publicIPAddresses/', variables('publicIPAddressName'))]", 103 | "[concat('Microsoft.Network/virtualNetworks/', variables('virtualNetworkName'))]" 104 | ], 105 | "properties": { 106 | "ipConfigurations": [ 107 | { 108 | "name": "ipconfig1", 109 | "properties": { 110 | "privateIPAllocationMethod": "Dynamic", 111 | "publicIPAddress": { 112 | "id": "[resourceId('Microsoft.Network/publicIPAddresses',variables('publicIPAddressName'))]" 113 | }, 114 | "subnet": { 115 | "id": "[variables('subnetRef')]" 116 | } 117 | } 118 | } 119 | ] 120 | } 121 | }, 122 | { 123 | "apiVersion": "2015-06-15", 124 | "type": "Microsoft.Compute/virtualMachines", 125 | "name": "[parameters('vmName')]", 126 | "location": "[variables('location')]", 127 | "plan": { 128 | "name": "[variables('sku')]", 129 | "publisher": "[variables('imagePublisher')]", 130 | "product": "[variables('imageOffer')]" 131 | }, 132 | "tags": { 133 | "Application": "DataScience" 134 | }, 135 | "dependsOn": [ 136 | "[concat('Microsoft.Storage/storageAccounts/', variables('storageAccountName'))]", 137 | "[concat('Microsoft.Network/networkInterfaces/', variables('nicName'))]" 138 | ], 139 | "properties": { 140 | "hardwareProfile": { 141 | "vmSize": "[variables('vmSize')]" 142 | }, 143 | "osProfile": { 144 | "computerName": "[parameters('vmName')]", 145 | "adminUsername": "[parameters('adminUsername')]", 146 | "adminPassword": "[parameters('adminPassword')]" 147 | }, 148 | "storageProfile": { 149 | "imageReference": { 150 | "publisher": "[variables('imagePublisher')]", 151 | "offer": "[variables('imageOffer')]", 152 | "sku": "[variables('sku')]", 153 | "version": "[variables('version')]" 154 | }, 155 | "osDisk": { 156 | "name": "osdisk", 157 | "vhd": { 158 | "uri": "[concat('http://',variables('storageAccountName'),'.blob.core.windows.net/',variables('vmStorageAccountContainerName'),'/',variables('OSDiskName'), parameters('vmName'), '.vhd')]" 159 | }, 160 | "caching": "ReadWrite", 161 | "createOption": "FromImage" 162 | } 163 | }, 164 | "networkProfile": { 165 | "networkInterfaces": [ 166 | { 167 | "id": "[resourceId('Microsoft.Network/networkInterfaces',variables('nicName'))]" 168 | } 169 | ] 170 | }, 171 | "diagnosticsProfile": { 172 | "bootDiagnostics": { 173 | "enabled": "true", 174 | "storageUri": "[concat('http://',variables('storageAccountName'),'.blob.core.windows.net')]" 175 | } 176 | } 177 | } 178 | } 179 | ], 180 | "outputs": { 181 | "vmUrl": { 182 | "value": "[concat('https://ms.portal.azure.com/#resource/subscriptions/', subscription().subscriptionId, '/resourceGroups/', resourceGroup().name, 
'/providers/Microsoft.Compute/virtualMachines/', parameters('vmName'))]", 183 | "type": "string" 184 | }, 185 | "vmFqdn": { 186 | "value": "[reference( variables('publicIPAddressName')).dnsSettings.fqdn]", 187 | "type": "string" 188 | }, 189 | "vmAdminUsername": { 190 | "value": "[parameters('adminUsername')]", 191 | "type": "string" 192 | }, 193 | "vmAdminPassword": { 194 | "value": "[parameters('adminPassword')]", 195 | "type": "string" 196 | }, 197 | "sqlServerName": { 198 | "value": "[parameters('vmName')]", 199 | "type": "string" 200 | } 201 | } 202 | } 203 | -------------------------------------------------------------------------------- /ArmTemplates/fraud-detection_arm.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentTemplate.json#", 3 | "contentVersion": "1.0.0.0", 4 | "parameters": { 5 | "adminUsername": { 6 | "type": "string", 7 | "metadata": { 8 | "description": "Username for the Virtual Machine." 9 | } 10 | }, 11 | "adminPassword": { 12 | "type": "securestring", 13 | "metadata": { 14 | "description": "Password for the Virtual Machine. The password must be 8 or more characters long. It must contain 1+ uppercase character(s), 1+ lowercase character(s), 1+ number(s), and 1+ special character(s) from ~!@#$%^&()-_+=|<>\/;:,." 15 | } 16 | }, 17 | "vmSize": { 18 | "type": "string", 19 | "defaultValue": "Standard_DS4_v2", 20 | "allowedValues": [ 21 | "Basic_A0", 22 | "Basic_A1", 23 | "Basic_A2", 24 | "Basic_A3", 25 | "Basic_A4", 26 | "Standard_A0", 27 | "Standard_A1", 28 | "Standard_A2", 29 | "Standard_A3", 30 | "Standard_A4", 31 | "Standard_A5", 32 | "Standard_A6", 33 | "Standard_A7", 34 | "Standard_A8", 35 | "Standard_A9", 36 | "Standard_A10", 37 | "Standard_A11", 38 | "Standard_A1_v2", 39 | "Standard_A2_v2", 40 | "Standard_A4_v2", 41 | "Standard_A8_v2", 42 | "Standard_A2m_v2", 43 | "Standard_A4m_v2", 44 | "Standard_A8m_v2", 45 | "Standard_B1s", 46 | "Standard_B1ms", 47 | "Standard_B2s", 48 | "Standard_B2ms", 49 | "Standard_B4ms", 50 | "Standard_B8ms", 51 | "Standard_D1", 52 | "Standard_D2", 53 | "Standard_D3", 54 | "Standard_D4", 55 | "Standard_D11", 56 | "Standard_D12", 57 | "Standard_D13", 58 | "Standard_D14", 59 | "Standard_D1_v2", 60 | "Standard_D2_v2", 61 | "Standard_D3_v2", 62 | "Standard_D4_v2", 63 | "Standard_D5_v2", 64 | "Standard_D2_v3", 65 | "Standard_D4_v3", 66 | "Standard_D8_v3", 67 | "Standard_D16_v3", 68 | "Standard_D32_v3", 69 | "Standard_D64_v3", 70 | "Standard_D2s_v3", 71 | "Standard_D4s_v3", 72 | "Standard_D8s_v3", 73 | "Standard_D16s_v3", 74 | "Standard_D32s_v3", 75 | "Standard_D64s_v3", 76 | "Standard_D11_v2", 77 | "Standard_D12_v2", 78 | "Standard_D13_v2", 79 | "Standard_D14_v2", 80 | "Standard_D15_v2", 81 | "Standard_DS1", 82 | "Standard_DS2", 83 | "Standard_DS3", 84 | "Standard_DS4", 85 | "Standard_DS11", 86 | "Standard_DS12", 87 | "Standard_DS13", 88 | "Standard_DS14", 89 | "Standard_DS1_v2", 90 | "Standard_DS2_v2", 91 | "Standard_DS3_v2", 92 | "Standard_DS4_v2", 93 | "Standard_DS5_v2", 94 | "Standard_DS11_v2", 95 | "Standard_DS12_v2", 96 | "Standard_DS13_v2", 97 | "Standard_DS14_v2", 98 | "Standard_DS15_v2", 99 | "Standard_DS13-4_v2", 100 | "Standard_DS13-2_v2", 101 | "Standard_DS14-8_v2", 102 | "Standard_DS14-4_v2", 103 | "Standard_E2_v3", 104 | "Standard_E4_v3", 105 | "Standard_E8_v3", 106 | "Standard_E16_v3", 107 | "Standard_E32_v3", 108 | "Standard_E64_v3", 109 | "Standard_E2s_v3", 110 | "Standard_E4s_v3", 111 | 
"Standard_E8s_v3", 112 | "Standard_E16s_v3", 113 | "Standard_E32s_v3", 114 | "Standard_E64s_v3", 115 | "Standard_E32-16_v3", 116 | "Standard_E32-8s_v3", 117 | "Standard_E64-32s_v3", 118 | "Standard_E64-16s_v3", 119 | "Standard_F1", 120 | "Standard_F2", 121 | "Standard_F4", 122 | "Standard_F8", 123 | "Standard_F16", 124 | "Standard_F1s", 125 | "Standard_F2s", 126 | "Standard_F4s", 127 | "Standard_F8s", 128 | "Standard_F16s", 129 | "Standard_F2s_v2", 130 | "Standard_F4s_v2", 131 | "Standard_F8s_v2", 132 | "Standard_F16s_v2", 133 | "Standard_F32s_v2", 134 | "Standard_F64s_v2", 135 | "Standard_F72s_v2", 136 | "Standard_G1", 137 | "Standard_G2", 138 | "Standard_G3", 139 | "Standard_G4", 140 | "Standard_G5", 141 | "Standard_GS1", 142 | "Standard_GS2", 143 | "Standard_GS3", 144 | "Standard_GS4", 145 | "Standard_GS5", 146 | "Standard_GS4-8", 147 | "Standard_GS4-4", 148 | "Standard_GS5-16", 149 | "Standard_GS5-8", 150 | "Standard_H8", 151 | "Standard_H16", 152 | "Standard_H8m", 153 | "Standard_H16m", 154 | "Standard_H16r", 155 | "Standard_H16mr", 156 | "Standard_L4s", 157 | "Standard_L8s", 158 | "Standard_L16s", 159 | "Standard_L32s", 160 | "Standard_M64s", 161 | "Standard_M64ms", 162 | "Standard_M128s", 163 | "Standard_M128ms", 164 | "Standard_M64-32ms", 165 | "Standard_M64-16ms", 166 | "Standard_M128-64ms", 167 | "Standard_M128-32ms", 168 | "Standard_NC6", 169 | "Standard_NC12", 170 | "Standard_NC24", 171 | "Standard_NC24r", 172 | "Standard_NC6s_v2", 173 | "Standard_NC12s_v2", 174 | "Standard_NC24s_v2", 175 | "Standard_NC24rs_v2", 176 | "Standard_NC6s_v3", 177 | "Standard_NC12s_v3", 178 | "Standard_NC24s_v3", 179 | "Standard_NC24rs_v3", 180 | "Standard_ND6s", 181 | "Standard_ND12s", 182 | "Standard_ND24s", 183 | "Standard_ND24rs", 184 | "Standard_NV6", 185 | "Standard_NV12", 186 | "Standard_NV24" 187 | ], 188 | "metadata": { 189 | "description": "Select a SKU for the virtual machine. 
Recommended SKU is 'Standard_DS4_v2'" 190 | } 191 | } 192 | }, 193 | "variables": { 194 | "location": "[resourceGroup().location]", 195 | "vmName": "[toLower(concat('co', uniqueString(resourceGroup().id)))]" 196 | }, 197 | "resources": [ 198 | { 199 | "apiVersion": "2017-05-10", 200 | "name": "dsvmTemplate", 201 | "type": "Microsoft.Resources/deployments", 202 | "properties": { 203 | "mode": "incremental", 204 | "templateLink": { 205 | "uri": "https://raw.githubusercontent.com/Microsoft/r-server-fraud-detection/master/ArmTemplates/dsvm_arm.json", 206 | "contentVersion": "1.0.0.0" 207 | }, 208 | "parameters": { 209 | "adminUsername": { "value": "[parameters('adminUsername')]" }, 210 | "adminPassword": { "value": "[parameters('adminPassword')]" }, 211 | "vmName": { "value": "[variables('vmName')]" }, 212 | "vmSize": { "value": "[parameters('vmSize')]" } 213 | } 214 | } 215 | }, 216 | { 217 | "apiVersion": "2017-05-10", 218 | "name": "configureTemplate", 219 | "type": "Microsoft.Resources/deployments", 220 | "dependsOn": [ 221 | "dsvmTemplate" 222 | ], 223 | "properties": { 224 | "mode": "incremental", 225 | "templateLink": { 226 | "uri": "https://raw.githubusercontent.com/Microsoft/r-server-fraud-detection/master/ArmTemplates/configure_arm.json", 227 | "contentVersion": "1.0.0.0" 228 | }, 229 | "parameters": { 230 | "adminUsername": { "value": "[parameters('adminUsername')]" }, 231 | "adminPassword": { "value": "[parameters('adminPassword')]" }, 232 | "vmName": { "value": "[variables('vmName')]" } 233 | } 234 | } 235 | } 236 | ], 237 | "outputs": {} 238 | } 239 | -------------------------------------------------------------------------------- /ArmTemplates/fraud-detection_hdi_arm.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://schema.management.azure.com/schemas/2014-04-01-preview/deploymentTemplate.json#", 3 | "contentVersion": "0.9.0.0", 4 | "parameters": { 5 | "clusterName": { 6 | "type": "string", 7 | "metadata": { 8 | "description": "The name of the HDInsight cluster to create." 9 | } 10 | }, 11 | "clusterLoginUserName": { 12 | "type": "string", 13 | "defaultValue": "admin", 14 | "metadata": { 15 | "description": "These credentials can be used to submit jobs to the cluster and to log into cluster dashboards." 16 | } 17 | }, 18 | "clusterLoginPassword": { 19 | "type": "securestring", 20 | "metadata": { 21 | "description": "The password must be at least 10 characters in length and must contain at least one digit, one non-alphanumeric character, and one upper or lower case letter." 22 | } 23 | }, 24 | "location": { 25 | "type": "string", 26 | "defaultValue": "westus2", 27 | "metadata": { 28 | "description": "The location where all azure resources will be deployed." 29 | } 30 | }, 31 | "clusterWorkerNodeCount": { 32 | "type": "int", 33 | "defaultValue": 3, 34 | "metadata": { 35 | "description": "The number of nodes in the HDInsight cluster. Make sure to set this to at least 3." 36 | } 37 | }, 38 | "sshUserName": { 39 | "type": "string", 40 | "defaultValue": "sshuser", 41 | "metadata": { 42 | "description": "These credentials can be used to remotely access the cluster." 43 | } 44 | }, 45 | "sshPassword": { 46 | "type": "securestring", 47 | "metadata": { 48 | "description": "The password must be at least 10 characters in length and must contain at least one digit, one non-alphanumeric character, and one upper or lower case letter." 
49 | } 50 | } 51 | }, 52 | "variables": { 53 | "defaultStorageAccount": { 54 | "name": "[uniqueString(resourceGroup().id)]", 55 | "type": "Standard_LRS" 56 | } 57 | }, 58 | "resources": [ 59 | { 60 | "apiVersion": "2015-03-01-preview", 61 | "name": "[parameters('clusterName')]", 62 | "type": "Microsoft.HDInsight/clusters", 63 | "location": "[parameters('location')]", 64 | "dependsOn": [ 65 | "[concat('Microsoft.Storage/storageAccounts/',variables('defaultStorageAccount').name)]" 66 | ], 67 | "properties": { 68 | "clusterVersion": "3.6", 69 | "osType": "Linux", 70 | "tier": "standard", 71 | "clusterDefinition": { 72 | "kind": "MLSERVICES", 73 | "configurations": { 74 | "gateway": { 75 | "restAuthCredential.isEnabled": true, 76 | "restAuthCredential.username": "[parameters('clusterLoginUserName')]", 77 | "restAuthCredential.password": "[parameters('clusterLoginPassword')]" 78 | }, 79 | "rserver": { 80 | "rstudio": true 81 | } 82 | } 83 | }, 84 | "storageProfile": { 85 | "storageaccounts": [ 86 | { 87 | "name": "[replace(replace(concat(reference(concat('Microsoft.Storage/storageAccounts/', variables('defaultStorageAccount').name), '2016-01-01').primaryEndpoints.blob),'https:',''),'/','')]", 88 | "isDefault": true, 89 | "container": "[parameters('clusterName')]", 90 | "key": "[listKeys(resourceId('Microsoft.Storage/storageAccounts', variables('defaultStorageAccount').name), '2016-01-01').keys[0].value]" 91 | } 92 | ] 93 | }, 94 | "computeProfile": { 95 | "roles": [ 96 | { 97 | "autoscale": null, 98 | "name": "headnode", 99 | "minInstanceCount": 1, 100 | "targetInstanceCount": 2, 101 | "hardwareProfile": { 102 | "vmSize": "Standard_D12_V2" 103 | }, 104 | "osProfile": { 105 | "linuxOperatingSystemProfile": { 106 | "username": "[parameters('sshUserName')]", 107 | "password": "[parameters('sshPassword')]" 108 | } 109 | }, 110 | "virtualNetworkProfile": null, 111 | "scriptActions": [] 112 | }, 113 | { 114 | "autoscale": null, 115 | "name": "workernode", 116 | "targetInstanceCount": "[parameters('clusterWorkerNodeCount')]", 117 | "hardwareProfile": { 118 | "vmSize": "Standard_D4_V2" 119 | }, 120 | "osProfile": { 121 | "linuxOperatingSystemProfile": { 122 | "username": "[parameters('sshUserName')]", 123 | "password": "[parameters('sshPassword')]" 124 | } 125 | }, 126 | "virtualNetworkProfile": null, 127 | "scriptActions": [] 128 | }, 129 | { 130 | "autoscale": null, 131 | "name": "zookeepernode", 132 | "minInstanceCount": 1, 133 | "targetInstanceCount": 3, 134 | "hardwareProfile": { 135 | "vmSize": "Standard_A2_V2" 136 | }, 137 | "osProfile": { 138 | "linuxOperatingSystemProfile": { 139 | "username": "[parameters('sshUserName')]", 140 | "password": "[parameters('sshPassword')]" 141 | } 142 | }, 143 | "virtualNetworkProfile": null, 144 | "scriptActions": [] 145 | }, 146 | { 147 | "autoscale": null, 148 | "name": "edgenode", 149 | "minInstanceCount": 1, 150 | "targetInstanceCount": 1, 151 | "hardwareProfile": { 152 | "vmSize": "Standard_D4_V2" 153 | }, 154 | "osProfile": { 155 | "linuxOperatingSystemProfile": { 156 | "username": "[parameters('sshUserName')]", 157 | "password": "[parameters('sshPassword')]" 158 | } 159 | }, 160 | "virtualNetworkProfile": null, 161 | "scriptActions": [ 162 | { 163 | "name": "lcrsetup", 164 | "uri": "https://raw.githubusercontent.com/Microsoft/r-server-fraud-detection/master/Resources/ActionScripts/hdisetup.sh", 165 | "parameters": "", 166 | "isHeadNode": false, 167 | "isWorkerNode": false, 168 | "isPersisted": true, 169 | "isZookeeperNode": false, 170 | "isEdgeNode": 
true, 171 | "applicationName": null 172 | } 173 | ] 174 | } 175 | ] 176 | } 177 | } 178 | }, 179 | { 180 | "type": "Microsoft.Storage/storageAccounts", 181 | "name": "[variables('defaultStorageAccount').name]", 182 | "apiVersion": "2015-05-01-preview", 183 | "location": "[parameters('location')]", 184 | "properties": { 185 | "accountType": "Standard_LRS" 186 | } 187 | } 188 | ] 189 | } -------------------------------------------------------------------------------- /Data/readme.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Fraud Detection 4 | ## Implemented on SQL Server 2016 R Services and HDInsight Spark 5 | 6 | For all documentation, visit the [Fraud Detection website](https://microsoft.github.io/r-server-fraud-detection/). 7 | 8 | **NOTE:** Please don't use "Download ZIP" to get this repository, as it will change the line endings in the data files. Use "git clone" to get a local copy of this repository. 9 | 10 | # Contributing 11 | 12 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. All rights reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/R/Fraud.rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 |
--------------------------------------------------------------------------------
/R/Fraud.rxproj:
--------------------------------------------------------------------------------
[XML project file; its element markup was stripped during extraction. The surviving values are: ProjectGuid 69478067-7c7d-458b-9aa5-299a620e57e3, ToolsVersion 14.0, configuration Debug / AnyCPU, and startup file script.R.]
--------------------------------------------------------------------------------
/R/Fraud.sln:
--------------------------------------------------------------------------------
1 | 
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio 14
4 | VisualStudioVersion = 14.0.25420.1
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{DA7A21FA-8162-4350-AD77-A8D1B671F3ED}") = "Fraud", "Fraud.rxproj", "{69478067-7C7D-458B-9AA5-299A620E57E3}"
7 | EndProject
8 | Global
9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | Debug|Any CPU = Debug|Any CPU
11 | Release|Any CPU = Release|Any CPU
12 | EndGlobalSection
13 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
14 | {69478067-7C7D-458B-9AA5-299A620E57E3}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
15 | {69478067-7C7D-458B-9AA5-299A620E57E3}.Debug|Any CPU.Build.0 = Debug|Any CPU
16 | {69478067-7C7D-458B-9AA5-299A620E57E3}.Release|Any CPU.ActiveCfg = Release|Any CPU
17 | {69478067-7C7D-458B-9AA5-299A620E57E3}.Release|Any CPU.Build.0 = Release|Any CPU
18 | EndGlobalSection
19 | GlobalSection(SolutionProperties) = preSolution
20 | HideSolutionNode = FALSE
21 | EndGlobalSection
22 | EndGlobal
23 |
--------------------------------------------------------------------------------
/R/modeling_main.R:
--------------------------------------------------------------------------------
1 | ##########################################################################################################################################
2 | ## This R script will do the following:
3 | ## 1. Specify parameters: Full path of the three input tables, SQL Server database name, User ID, Password, and Server Name.
4 | ## 2. Source the different scripts for the Development Stage.
5 |
6 | ## Input : Full path of the three input tables, database name, User ID, Password, and Server Name.
7 | ## Output: Trained model and Predictions on the testing set as well as performance metrics.
8 |
9 | ##########################################################################################################################################
10 |
11 | # Load library.
12 | library(RevoScaleR)
13 |
14 | # Set the working directory to the R scripts location.
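# A minimal example, assuming a hypothetical clone location:
# setwd("C:/Users/<username>/r-server-fraud-detection/R")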
15 | # setwd()
16 |
17 | ##########################################################################################################################################
18 | ## SPECIFY INPUTS
19 | ##########################################################################################################################################
20 |
21 | # Data sets full path. The paths below work if the working directory is set to the R scripts location.
22 | Untagged_Transactions <- "../Data/Untagged_Transactions.csv"
23 | Account_Info <- "../Data/Account_Info.csv"
24 | #Fraud <- "../Data/Fraud.csv"
25 | Fraud_Transactions <- "../Data/Fraud_Transactions.csv"
26 |
27 |
28 | # Creating the connection string. Specify:
29 | ## Database name. If it already exists, tables will be overwritten. If not, it will be created.
30 | ## Server name. If connecting remotely to the DSVM, the full DNS address should be used with the port number 1433 (which should be enabled).
31 | db_name <- "FraudR"
32 | server <- "localhost"
33 | connection_string <- sprintf("Driver=SQL Server;Server=%s;Database=%s;Trusted_Connection=TRUE", server, db_name)
34 | # The connection above is set up to use your Windows credentials.
35 | # To use an id/password instead, add them in the lines below and uncomment them.
36 | # user_id <- "XXXYOURID"
37 | # password <- "XXXYOURPW"
38 | # connection_string <- sprintf("Driver=SQL Server;Server=%s;Database=%s;UID=%s;PWD=%s", server, db_name, user_id, password)
39 |
40 | ##############################################################################################################################
41 | ## Database Creation.
42 | ##############################################################################################################################
43 |
44 | # Open an ODBC connection with the SQL Server master database, only to create a new database with the rxExecuteSQLDDL function.
45 |
46 | connection_string_master <- sprintf("Driver=SQL Server;Server=%s;Database=master;Trusted_Connection=TRUE", server)
47 | # Or with id/password:
48 | # connection_string_master <- sprintf("Driver=SQL Server;Server=%s;Database=master;UID=%s;PWD=%s", server, user_id, password)
49 |
50 | outOdbcDS_master <- RxOdbcData(table = "Default_Master", connectionString = connection_string_master)
51 | rxOpen(outOdbcDS_master, "w")
52 |
53 | # Create the database if applicable.
54 | query <- sprintf( "if not exists(SELECT * FROM sys.databases WHERE name = '%s') CREATE DATABASE %s;", db_name, db_name)
55 | rxExecuteSQLDDL(outOdbcDS_master, sSQLString = query)
56 |
57 | # Create the SQLRUserGroup user.
58 |
59 | query <- sprintf("USE [%s] CREATE USER [dsvm\\SQLRUserGroup] FOR LOGIN [dsvm\\SQLRUserGroup]", db_name)
60 | rxExecuteSQLDDL(outOdbcDS_master, sSQLString = query)
61 |
62 | # Close the ODBC connection to the master database.
63 | rxClose(outOdbcDS_master)
64 |
65 | ##############################################################################################################################
66 | ## ODBC connection and SQL Compute Context.
67 | ##############################################################################################################################
68 |
69 | # Open an ODBC connection with the SQL Server database that will store the modeling tables. (Only used for rxExecuteSQLDDL.)
70 | outOdbcDS <- RxOdbcData(table = "Default", connectionString = connection_string)
71 | rxOpen(outOdbcDS, "w")
72 |
73 | # Define the SQL Compute Context for in-database computations.
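# The compute context below is what lets the rx* functions run in-database: once a step script calls
# rxSetComputeContext(sql) (see step2_splitting_preprocessing.R), computations execute inside SQL Server
# instead of pulling the data down to the R client.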
74 | sql <- RxInSqlServer(connectionString = connection_string)
75 |
76 | ##############################################################################################################################
77 | ## Modeling Pipeline.
78 | ##############################################################################################################################
79 |
80 | # Step 1: Tagging.
81 | print("Step 1: Tagging.")
82 | source("./step1_tagging.R")
83 |
84 | # Step 2: Splitting & Preprocessing the training set.
85 | print("Step 2: Splitting and Preprocessing the training set.")
86 | source("./step2_splitting_preprocessing.R")
87 |
88 | # Step 3: Feature Engineering.
89 | print("Step 3: Feature Engineering on the training set.")
90 | source("./step3_feature_engineering.R")
91 |
92 | # Step 4: Training; preprocessing and feature engineering on the testing set; scoring and evaluation of the GBT model.
93 | print("Step 4: Training, Scoring and Evaluating.")
94 | source("./step4_training_evaluation.R")
95 |
96 | # Close the ODBC connection used for the rxExecuteSQLDDL functions.
97 | rxClose(outOdbcDS)
98 |
99 | ##########################################################################################################################################
100 | ## Function to get the top n rows of a table stored on SQL Server.
101 | ## You can execute this function at any point in your progress by removing the comment marks "#" below and inputting:
102 | ## - the table name.
103 | ## - the number of rows you want to display.
104 | ##########################################################################################################################################
105 |
106 | display_head <- function(table_name, n_rows){
107 |   table_sql <- RxSqlServerData(sqlQuery = sprintf("SELECT TOP(%s) * FROM %s", n_rows, table_name), connectionString = connection_string)
108 |   table <- rxImport(table_sql)
109 |   print(table)
110 | }
111 |
112 | # table_name <- "insert_table_name"
113 | # n_rows <- 10
114 | # display_head(table_name, n_rows)
115 |
116 |
117 |
--------------------------------------------------------------------------------
/R/readme.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Fraud Detection
4 | ## Implemented on SQL Server 2016 R Services and HDInsight Spark
5 |
6 | For all documentation, visit the [Fraud Detection website](https://microsoft.github.io/r-server-fraud-detection/).
7 |
8 | **NOTE:** Please don't use "Download ZIP" to get this repository, as it will change the line endings in the data files. Use "git clone" to get a local copy of this repository.
9 |
10 | # Contributing
11 |
12 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
13 |
--------------------------------------------------------------------------------
/R/step2_splitting_preprocessing.R:
--------------------------------------------------------------------------------
1 | ##########################################################################################################################################
2 | ## This R script will do the following:
3 | ## 1. Split the tagged data set into a Training and a Testing set.
4 | ## 2. Clean the training set and perform some preprocessing.
5 | 6 | ## Input : Tagged data set. 7 | ## Output: Training and Testing sets, and cleaned Training set Tagged_Training_Processed. 8 | 9 | ########################################################################################################################################## 10 | 11 | # Set the compute context to SQL. 12 | rxSetComputeContext(sql) 13 | 14 | ############################################################################################################################################# 15 | ## The block below will split the Tagged data set into a Training and a Testing set. 16 | ############################################################################################################################################ 17 | 18 | print("Randomly splitting into a training and a testing set...") 19 | 20 | # Create the Hash_Id table containing accountID hashed to 100 bins. 21 | # The advantage of using a hashing function for splitting is to: 22 | # - ensure that the same accountID ends up in the same split. 23 | # - permit repeatability of the experiment. 24 | rxExecuteSQLDDL(outOdbcDS, sSQLString = "DROP TABLE if exists Hash_Id;") 25 | 26 | rxExecuteSQLDDL(outOdbcDS, sSQLString = 27 | "SELECT accountID, ABS(CAST(CAST(HashBytes('MD5', accountID) AS VARBINARY(64)) AS BIGINT) % 100) AS hashCode 28 | INTO Hash_Id 29 | FROM Tagged ;") 30 | 31 | # Point to the training set. 32 | # At the same time, we remove: 33 | # - variables not used in the next steps (intermediate variables, variables not needed for the training, variables with only missing values). 34 | # - observations with labels equal to 2 (pre-fraud). 35 | # - observations where accountID, transactionID and transactionDateTime are missing. 36 | # - observations where the transaction amount in USD is negative. 37 | 38 | query_training <- "SELECT label, accountID, transactionID, transactionDateTime, isProxyIP, paymentInstrumentType, cardType, paymentBillingAddress, 39 | paymentBillingPostalCode, paymentBillingCountryCode, paymentBillingName, accountAddress, accountPostalCode, 40 | accountCountry, accountOwnerName, shippingAddress, transactionCurrencyCode,localHour, ipState, ipPostCode, 41 | ipCountryCode, browserLanguage, paymentBillingState, accountState, transactionAmountUSD, digitalItemCount, 42 | physicalItemCount, accountAge, paymentInstrumentAgeInAccount, numPaymentRejects1dPerUser, isUserRegistered, 43 | transactionDate, transactionTime 44 | FROM Tagged 45 | WHERE accountID IN (SELECT accountID from Hash_Id WHERE hashCode <= 70) 46 | AND label != 2 47 | AND accountID IS NOT NULL 48 | AND transactionID IS NOT NULL 49 | AND transactionDateTime IS NOT NULL 50 | AND cast(transactionAmountUSD as float) >= 0" 51 | 52 | Tagged_Training_sql <- RxSqlServerData(sqlQuery = query_training, connectionString = connection_string) 53 | 54 | ############################################################################################################################################ 55 | ## The block below will clean the Tagged data. 56 | ############################################################################################################################################ 57 | 58 | print("Cleaning and preprocessing the training set...") 59 | 60 | clean_preprocess <- function(input_data_query, output_sql_name){ 61 | 62 | # Detect variables with missing values. 63 | # No missing values in accountID, transactionID and transactionDateTime since we already filtered out missing values in the query above. 
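# (rxSummary returns an object whose sDataFrame component has one row per variable; its MissingObs
# column is what the code below uses to detect the variables that need cleaning.)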
64 | # For rxSummary to give correct info on characters, stringsAsFactors = TRUE should be used in the pointer to the SQL Tagged_Training table.
65 | Tagged_Data_sql_stringsfactors <- RxSqlServerData(sqlQuery = input_data_query, connectionString = connection_string, stringsAsFactors = TRUE)
66 | var <- rxGetVarNames(Tagged_Data_sql_stringsfactors)
67 | formula <- as.formula(paste("~", paste(var, collapse = "+")))
68 | summary <- rxSummary(formula, Tagged_Data_sql_stringsfactors, byTerm = TRUE)
69 | variables_NA <- summary$sDataFrame[summary$sDataFrame$MissingObs > 0, 1]
70 | variables_NA <- variables_NA[!variables_NA %in% c("accountID", "transactionID", "transactionDateTime", "transactionDate", "transactionTime")]
71 |
72 | # If there are no missing values, we will only preprocess the data. Otherwise, we clean and preprocess.
73 | if(length(variables_NA) == 0){
74 |   print("No missing values: only preprocessing will be performed.")
75 | } else{
76 |   print("Variables containing missing values are:")
77 |   print(variables_NA)
78 | }
79 |
80 | # Function to replace missing values with 0. It will be wrapped into rxDataStep.
81 | preprocessing <- function(data) {
82 |   data <- data.frame(data, stringsAsFactors = FALSE)
83 |
84 |   # Replace missing values with 0, except for localHour, which gets -99.
85 |   if(length(var_with_NA) > 0){
86 |     for(i in 1:length(var_with_NA)){
87 |       row_na <- which(is.na(data[, var_with_NA[i]]))
88 |       if(var_with_NA[i] == "localHour"){
89 |         data[row_na, var_with_NA[i]] <- "-99"
90 |       } else{
91 |         data[row_na, var_with_NA[i]] <- "0"
92 |       }
93 |     }
94 |   }
95 |
96 |   # Fix some data entries in isUserRegistered, which should be binary.
97 |   row_na <- which(data[, c("isUserRegistered")] %in% as.character(seq(1, 9)))
98 |   data[row_na, c("isUserRegistered")] <- "0"
99 |
100 |   # Convert a few variables to numeric, replacing non-numeric entries with 0, to fix some data entries.
101 |   numeric_to_fix <- c("accountAge", "paymentInstrumentAgeInAccount", "numPaymentRejects1dPerUser", "transactionAmountUSD",
102 |                       "digitalItemCount", "physicalItemCount")
103 |   for(i in 1:length(numeric_to_fix)){
104 |     data[, numeric_to_fix[i]] <- as.numeric(data[, numeric_to_fix[i]])
105 |     row_na <- which(is.na(as.numeric(data[, numeric_to_fix[i]])))
106 |     data[row_na, numeric_to_fix[i]] <- 0
107 |   }
108 |   return(data)
109 | }
110 |
111 | # Input and Output pointers.
112 | Input_sql <- RxSqlServerData(sqlQuery = input_data_query, connectionString = connection_string)
113 | Output_sql <- RxSqlServerData(table = output_sql_name, connectionString = connection_string)
114 |
115 | # We drop the output if it already exists as a view, in case the SQL stored procedure was executed in the same database before.
116 | rxExecuteSQLDDL(outOdbcDS, sSQLString = sprintf("IF OBJECT_ID ('%s', 'V') IS NOT NULL DROP VIEW %s ;",
117 |                 output_sql_name, output_sql_name))
118 |
119 | # Perform the data cleaning with rxDataStep.
120 | ## To preserve the type of transactionDateTime, we recreate it.
121 | rxDataStep(inData = Input_sql,
122 |            outFile = Output_sql,
123 |            overwrite = TRUE,
124 |            rowsPerRead = 200000,
125 |            transformFunc = preprocessing,
126 |            transformObjects = list(var_with_NA = variables_NA),
127 |            transforms = list(
128 |              transactionDateTime = as.character(as.POSIXct(paste(transactionDate, sprintf("%06d", as.numeric(transactionTime)), sep=""), format = "%Y%m%d %H%M%S", tz = "GMT"))
129 |            ))
130 |
131 | }
132 |
133 | # Apply the preprocessing and cleaning to the training set.
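# The same function can be reused for other inputs; hypothetically, with a testing-set query named query_testing:
# clean_preprocess(input_data_query = query_testing, output_sql_name = "Tagged_Testing_Processed").
# In this pipeline, the testing set is actually processed later, in step 4.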
134 | clean_preprocess(input_data_query = query_training,
135 |                  output_sql_name = "Tagged_Training_Processed")
136 |
137 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Fraud Detection
4 | Predict if an online purchase transaction is fraudulent. This is an important scenario in many industries, including retail and finance.
5 |
6 | ### Deploy to Azure on SQL Server
7 | [![Deploy to Azure (SQL Server)](https://raw.githubusercontent.com/Azure/Azure-CortanaIntelligence-SolutionAuthoringWorkspace/master/docs/images/DeployToAzure.PNG)](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2FMicrosoft%2Fr-server-fraud-detection%2Fmaster%2FArmTemplates%2Ffraud-detection_arm.json)
8 |
9 | ## More samples and information
10 | > Discover more examples at [Microsoft Machine Learning Server](https://github.com/Microsoft/ML-Server)
11 |
12 | For all documentation, visit the [Fraud Detection website](https://microsoft.github.io/r-server-fraud-detection/).
13 |
14 | **NOTE:** Please don't use "Download ZIP" to get this repository, as it will change the line endings in the data files. Use "git clone" to get a local copy of this repository.
15 |
16 | ## Contributing
17 |
18 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
19 |
--------------------------------------------------------------------------------
/RSparkCluster/copy_dev_to_prod.R:
--------------------------------------------------------------------------------
1 | ##########################################################################################################################################
2 | ## This R script will define a function, copy_dev_to_prod, that:
3 | ## 1. Cleans up an already existing directory, or creates it, on the edge node: ProdModelDir.
4 | ## 2. Copies to that directory the risk tables, the trained model, etc. from the Development directory.
5 |
6 | ## Input : DevModelDir: Path to the directory on the edge node storing the risk tables, model, etc.
7 | ##         ProdModelDir: Path to the directory where the contents of DevModelDir should be copied.
8 | ## Output: ProdModelDir with data transferred from DevModelDir.
9 |
10 |
11 | ## It should be applied:
12 | ## a) If running the Production stage for the first time.
13 | ## b) If you want to run the Production stage with a newly trained model; the older one will be overwritten.
14 | ##########################################################################################################################################
15 |
16 | copy_dev_to_prod <- function(DevModelDir, ProdModelDir){
17 |
18 |   # Clean up the Production directory, or create it if it does not exist.
19 |   if(dir.exists(ProdModelDir)){
20 |     system(paste("rm -rf ", ProdModelDir, sep = "")) # remove the directory if it exists
21 |     system(paste("mkdir -p -m 777 ", ProdModelDir, sep = "")) # create a new directory
22 |   } else {
23 |     system(paste("mkdir -p -m 777 ", ProdModelDir, sep = "")) # make a new directory if it doesn't exist
24 |   }
25 |
26 |   # Copy the model, statistics and other data from the Development directory to the Production directory.
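# A single shell cp transfers every serialized .rds artifact (risk tables, trained model, ...) at once;
# both arguments are plain directories on the edge node's local file system.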
27 |   system(paste("cp ", DevModelDir, "/*.rds ", ProdModelDir, sep = ""))
28 |
29 | }
30 |
31 |
32 |
33 |
34 |
--------------------------------------------------------------------------------
/RSparkCluster/development_main.R:
--------------------------------------------------------------------------------
1 | ##########################################################################################################################################
2 | ## This R script will do the following:
3 | ## 1. Specify parameters for the main function.
4 | ## 2. Define the main function for development.
5 | ## 3. Invoke the main function.
6 |
7 | ## Input : 1. Full path of the three input tables on HDFS.
8 | ##         2. Working directories on the local edge node and HDFS.
9 | ##         3. Stage: "Dev" for development.
10 | ## Output: The evaluation metrics of the model.
11 | ##         Tables and model to be used for Production or Web Scoring are copied to the Production directory.
12 |
13 | ##########################################################################################################################################
14 |
15 | # The current working directory should be set with setwd() to the location of the .R files.
16 |
17 | ##########################################################################################################################################
18 | ## Open Spark Connection and load RevoScaleR library.
19 | ##########################################################################################################################################
20 |
21 | rxSparkConnect(consoleOutput = TRUE, reset = TRUE)
22 | library(RevoScaleR)
23 |
24 | ##########################################################################################################################################
25 | ## Data sets full path
26 | ##########################################################################################################################################
27 |
28 | # Write the full path to the 3 data sets.
29 | Untagged_Transactions <- "/Fraud/Data/untaggedTransactions.csv"
30 | Account_Info <- "/Fraud/Data/accountInfo.csv"
31 | Fraud_Transactions <- "/Fraud/Data/fraudTransactions.csv"
32 |
33 | ##########################################################################################################################################
34 | ## Directories
35 | ##########################################################################################################################################
36 |
37 | # Local (edge node) working directory. We assume it already exists.
38 | LocalWorkDir <- paste("/var/RevoShare/", Sys.info()[["user"]], "/Fraud/dev", sep="")
39 | #dir.create(LocalWorkDir, recursive = TRUE)
40 |
41 | # HDFS directory for the user's calculations. We assume it already exists.
42 | HDFSWorkDir <- paste("/",Sys.info()[["user"]],"/Fraud/dev", sep="")
43 | #rxHadoopMakeDir(HDFSWorkDir)
44 |
45 | ##############################################################################################################################
46 | ## Define main function
47 | ##############################################################################################################################
48 |
49 | ## The user should replace the directory in the "source" function calls with their own:
50 | ## it should be the full path of the directory containing the source scripts.
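## For example, with a hypothetical clone location:
## source("/home/sshuser/r-server-fraud-detection/RSparkCluster/step1_merge_account_info.R")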
51 |
52 | fraud_dev <- function(Untagged_Transactions,
53 |                       Account_Info,
54 |                       Fraud_Transactions,
55 |                       LocalWorkDir,
56 |                       HDFSWorkDir,
57 |                       Stage = "Dev",
58 |                       update_prod_flag = 1){
59 |
60 |   # step0: intermediate directories creation.
61 |   print("Creating Intermediate Directories on Local and HDFS...")
62 |   source("./step0_directories_creation.R")
63 |
64 |   ## Define and create the directory where Risk tables, models, etc. will be saved in the Development stage.
65 |   LocalModelsDir <- file.path(LocalWorkDir, "model")
66 |   if(dir.exists(LocalModelsDir)){
67 |     system(paste("rm -rf ",LocalModelsDir,"/*", sep = "")) # clean up the directory if it exists
68 |   } else {
69 |     dir.create(LocalModelsDir, recursive = TRUE) # make a new directory if it doesn't exist
70 |   }
71 |
72 |   # step1: merging with account info
73 |   source("./step1_merge_account_info.R")
74 |   print("Step 1: Merging with account info...")
75 |   merge_account_info(Untagged_Transactions = Untagged_Transactions,
76 |                      Account_Info = Account_Info,
77 |                      HDFSWorkDir = HDFSWorkDir,
78 |                      Stage = Stage)
79 |
80 |   # step2: tagging
81 |   source("./step2_tagging.R")
82 |   print("Step 2: Tagging...")
83 |   tagging(Input_Hive_Table = "UntaggedTransactionsAccountUnique",
84 |           Fraud_Transactions = Fraud_Transactions,
85 |           HDFSWorkDir = HDFSWorkDir)
86 |
87 |   # step3: splitting
88 |   print("Step 3: Splitting...")
89 |   source("./step3_splitting.R")
90 |
91 |   # step4: preprocessing
92 |   print("Step 4: Preprocessing...")
93 |   source("./step4_preprocessing.R")
94 |   preprocess(HDFSWorkDir = HDFSWorkDir,
95 |              HiveTable = "TaggedTraining")
96 |   preprocess(HDFSWorkDir = HDFSWorkDir,
97 |              HiveTable = "TaggedTesting")
98 |
99 |   # step5: creating risk tables
100 |   print("Step 5: Creating risk tables...")
101 |   source("./step5_create_risk_tables.R")
102 |   create_risk_tables(LocalWorkDir = LocalWorkDir,
103 |                      HDFSWorkDir = HDFSWorkDir,
104 |                      HiveTable = "TaggedTrainingProcessed",
105 |                      smooth1 = 10,
106 |                      smooth2 = 100)
107 |
108 |   # step6: feature engineering
109 |   print("Step 6: Feature Engineering...")
110 |   source("./step6_feature_engineering.R")
111 |   feature_engineering(LocalWorkDir = LocalWorkDir,
112 |                       HDFSWorkDir = HDFSWorkDir,
113 |                       HiveTable = "TaggedTrainingProcessed",
114 |                       Stage = Stage)
115 |   feature_engineering(LocalWorkDir = LocalWorkDir,
116 |                       HDFSWorkDir = HDFSWorkDir,
117 |                       HiveTable = "TaggedTestingProcessed",
118 |                       Stage = Stage)
119 |
120 |
121 |   # step7: training
122 |   print("Step 7: Training...")
123 |   source("./step7_training.R")
124 |   training(HDFSWorkDir = HDFSWorkDir,
125 |            LocalWorkDir = LocalWorkDir,
126 |            Input_Data_Xdf = "TaggedTrainingProcessedFeatures")
127 |
128 |   # copy risk tables and the model object to the production folder if update_prod_flag = 1
129 |   if (update_prod_flag == 1){
130 |     # Production directory that will hold the development data.
131 |     ProdModelDir <- paste("/var/RevoShare/", Sys.info()[["user"]], "/Fraud/prod/model/", sep="")
132 |     # Development directory that holds data to be used in Production.
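# (This is the LocalModelsDir created at the top of fraud_dev and populated by the risk-table and training steps above.)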
133 | DevModelDir <- LocalModelsDir 134 | 135 | source("./copy_dev_to_prod.R") 136 | copy_dev_to_prod(DevModelDir, ProdModelDir) 137 | } 138 | 139 | # step8: prediction 140 | print("Step8: Prediction...") 141 | source("./step8_prediction.R") 142 | prediction(HDFSWorkDir = HDFSWorkDir, 143 | LocalWorkDir = LocalWorkDir, 144 | Input_Data_Xdf = "TaggedTestingProcessedFeatures", 145 | Stage = Stage) 146 | 147 | # step9: evaluation 148 | print("Step9: Evaluation...") 149 | source("./step9_evaluation.R") 150 | evaluation(HDFSWorkDir = HDFSWorkDir, 151 | Scored_Data_Xdf = "PredictScore") 152 | } 153 | 154 | ############################################################################################################################## 155 | ## Apply the main function 156 | ############################################################################################################################## 157 | 158 | fraud_dev (Untagged_Transactions = Untagged_Transactions, 159 | Account_Info = Account_Info, 160 | Fraud_Transactions = Fraud_Transactions, 161 | LocalWorkDir = LocalWorkDir, 162 | HDFSWorkDir = HDFSWorkDir, 163 | Stage = "Dev", 164 | update_prod_flag = 1) 165 | -------------------------------------------------------------------------------- /RSparkCluster/production_main.R: -------------------------------------------------------------------------------- 1 | ########################################################################################################################################## 2 | ## This R script will do the following: 3 | ## 1. Specify parameters for main function. 4 | ## 2. Define the main function for production batch scoring. 5 | ## 3. Invoke the main function. 6 | 7 | ## Input : 1. Full path of the two input tables on HDFS (for scoring with Spark) 8 | ## OR the two tables as data frames (for in-memory scoring). 9 | ## 2. Working directories on local edge node and HDFS. 10 | ## 3. Stage: "Prod" for batch scoring. 11 | ## Output: The directory on HDFS which contains the Scores (Spark version) or The Scores table (in-memory version). 12 | 13 | ########################################################################################################################################## 14 | 15 | ########################################################################################################################################## 16 | ## Load the RevoScaleR library and Open Spark Connection 17 | ########################################################################################################################################## 18 | 19 | library(RevoScaleR) 20 | rxSparkConnect(consoleOutput = TRUE, reset = TRUE) 21 | 22 | ########################################################################################################################################## 23 | ## Directories 24 | ########################################################################################################################################## 25 | 26 | # Local (edge node) working directory. We assume it already exists. 27 | LocalWorkDir <- paste("/var/RevoShare/", Sys.info()[["user"]], "/Fraud/prod", sep="") 28 | #dir.create(LocalWorkDir, recursive = TRUE) 29 | 30 | # HDFS directory for user calculation. We assume it already exists. 31 | HDFSWorkDir <- paste("/",Sys.info()[["user"]],"/Fraud/prod", sep="") 32 | #rxHadoopMakeDir(HDFSWorkDir) 33 | 34 | # Current working directory should be set with setwd() to the location of the .R files. 
35 | 36 | ########################################################################################################################################## 37 | ## Data sets full path 38 | ########################################################################################################################################## 39 | 40 | # Paths to the input data sets on HDFS. 41 | Untagged_Transactions_str <- "/Fraud/Data/untaggedTransactions_Prod.csv" 42 | Account_Info_str <- "/Fraud/Data/accountInfo.csv" 43 | 44 | # Import the .csv files as data frames. stringsAsFactors = F to avoid converting the ID variables to factors, which takes a very long time. 45 | Untagged_Transactions_df <- rxImport(RxTextData(file = Untagged_Transactions_str, fileSystem = RxHdfsFileSystem()), stringsAsFactors = F) 46 | Account_Info_df <- rxImport(RxTextData(file = Account_Info_str, fileSystem = RxHdfsFileSystem()), stringsAsFactors = F) 47 | 48 | 49 | ############################################################################################################################## 50 | ## Define main function 51 | ############################################################################################################################## 52 | 53 | ## If Untagged_Transactions and Account_Info are data frames, the web scoring is done in_memory. 54 | ## Use paths to csv files on HDFS for large data sets that do not fit in-memory. 55 | 56 | fraud_batch_scoring <- function(Untagged_Transactions, 57 | Account_Info, 58 | LocalWorkDir, 59 | HDFSWorkDir, 60 | Stage = "Prod") 61 | { 62 | 63 | # Directory that holds the tables and model from the Development stage. 64 | LocalModelsDir <- file.path(LocalWorkDir, "model") 65 | 66 | if((class(Untagged_Transactions) == "data.frame") & (class(Account_Info) == "data.frame")){ # In-memory scoring. 67 | source("./in_memory_scoring.R") 68 | print("Scoring in-memory...") 69 | return(in_memory_scoring(Untagged_Transactions, Account_Info, Stage = Stage)) 70 | 71 | } else{ # Using Spark for scoring. 72 | 73 | rxSparkConnect(consoleOutput = TRUE, reset = TRUE) 74 | 75 | # step0: intermediate directories creation. 76 | print("Creating Intermediate Directories on Local and HDFS...") 77 | source("./step0_directories_creation.R") 78 | 79 | # step1: merging the raw data. 80 | source("./step1_merge_account_info.R") 81 | print("Step 1: Production data merging.") 82 | 83 | merge_account_info(Untagged_Transactions = Untagged_Transactions, 84 | Account_Info = Account_Info, 85 | HDFSWorkDir = HDFSWorkDir, 86 | Stage = Stage) 87 | 88 | # step2: additional preprocessing. 89 | source("./step4_preprocessing.R") 90 | print("Step 2: Additional preprocessing of the production data.") 91 | 92 | preprocess(HDFSWorkDir = HDFSWorkDir, 93 | HiveTable = "TaggedProd") 94 | 95 | 96 | # step3: feature engineering 97 | source("./step6_feature_engineering.R") 98 | print("Step 3: Feature Engineering.") 99 | 100 | feature_engineering(LocalWorkDir = LocalWorkDir, 101 | HDFSWorkDir = HDFSWorkDir, 102 | HiveTable = "TaggedProdProcessed", 103 | Stage = Stage) 104 | 105 | # step4: making predictions. 106 | source("./step8_prediction.R") 107 | print("Step 4: Making Predictions.") 108 | 109 | prediction(HDFSWorkDir = HDFSWorkDir, 110 | LocalWorkDir = LocalWorkDir, 111 | Input_Data_Xdf = "TaggedProdProcessedFeatures", 112 | Stage = Stage) 113 | 114 | # Return the directory storing the final scores. 
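# (The scores are written under <HDFSWorkDir>/temp/PredictScore as an .xdf directory.
# A hedged sketch of how a downstream consumer could pull them back into a local data
# frame with standard RevoScaleR calls:
#   scores <- rxImport(RxXdfData(file.path(HDFSWorkDir, "temp", "PredictScore"),
#                                fileSystem = RxHdfsFileSystem()))
# )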
115 | return(file.path(HDFSWorkDir,"temp", "PredictScore")) 116 | 117 | } 118 | } 119 | 120 | ############################################################################################################################## 121 | ## Apply the main function 122 | ############################################################################################################################## 123 | 124 | # Case 1: Input are data frames. Scoring is performed in-memory. 125 | Scores <- fraud_batch_scoring(Untagged_Transactions = Untagged_Transactions_df, 126 | Account_Info = Account_Info_df, 127 | LocalWorkDir = LocalWorkDir, 128 | HDFSWorkDir = HDFSWorkDir, 129 | Stage = "Prod") 130 | 131 | # Case 2: Input are paths to csv files. Scoring using Spark. 132 | ## This alternative is slow and should only be used if the data set to score is too large to fit in memory. 133 | #scores_directory <- fraud_batch_scoring(Untagged_Transactions = Untagged_Transactions_str, 134 | # Account_Info = Account_Info_str, 135 | # LocalWorkDir = LocalWorkDir, 136 | # HDFSWorkDir = HDFSWorkDir, 137 | # Stage = "Prod") 138 | 139 | # Warning: in case you get the following error: "Error: file.exists(inData1) is not TRUE", 140 | # you should reset your R session with Ctrl + Shift + F10 (or Session -> Restart R) and try running it again. 141 | -------------------------------------------------------------------------------- /RSparkCluster/readme.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Fraud Detection 4 | ## Implemented on SQL Server 2016 R Services and HDInsight Spark 5 | 6 | For all documentation, visit the [Fraud Detection website](https://microsoft.github.io/r-server-fraud-detection/). 7 | 8 | **NOTE:** Please don't use "Download ZIP" to get this repository, as it will change the line endings in the data files. Use "git clone" to get a local copy of this repository. 9 | 10 | # Contributing 11 | 12 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 13 | -------------------------------------------------------------------------------- /RSparkCluster/step0_directories_creation.R: -------------------------------------------------------------------------------- 1 | ########################################################################################################################################## 2 | ## This R script will do the following: 3 | ## 1. Create or clean up an intermediate directory, LocalIntermediateDir, on the edge node. 4 | ## 2. Create or clean up an intermediate directory, HDFSIntermediateDir, on HDFS. 5 | 6 | ########################################################################################################################################## 7 | 8 | # Intermediate folders paths one on the edge node and one on HDFS. 9 | LocalIntermediateDir <- file.path(LocalWorkDir, "temp") 10 | HDFSIntermediateDir <- file.path(HDFSWorkDir,"temp") 11 | 12 | # Clean up the folders if they already exist and create them otherwise. 
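# (Design note: cleaning the temp folders here makes every pipeline run idempotent --
# re-running development_main.R or production_main.R starts from empty intermediate
# directories instead of mixing results from earlier runs.)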
13 | if(dir.exists(LocalIntermediateDir)){
14 | system(paste("rm -rf ",LocalIntermediateDir,"/*", sep="")) # clean up the directory if exists
15 | } else {
16 | dir.create(LocalIntermediateDir, recursive = TRUE) # make new directory if doesn't exist
17 | }
18 | 
19 | if(rxHadoopFileExists(HDFSIntermediateDir)){
20 | rxHadoopRemoveDir(HDFSIntermediateDir, skipTrash = TRUE)
21 | rxHadoopMakeDir(HDFSIntermediateDir)
22 | } else {
23 | rxHadoopMakeDir(HDFSIntermediateDir)
24 | }
25 | 
26 | # Grant access authority for the edge node intermediate folder.
27 | system(paste("chmod g+s ", LocalIntermediateDir, sep=""))
28 | system(paste("setfacl -d -m g::rwx ", LocalIntermediateDir, sep=""))
29 | system(paste("setfacl -d -m o::rwx ", LocalIntermediateDir, sep="")) -------------------------------------------------------------------------------- /RSparkCluster/step1_merge_account_info.R: --------------------------------------------------------------------------------
1 | ##########################################################################################################################################
2 | ## This R script will do the following:
3 | ## 1. Convert the UntaggedTransactions and AccountInfo data sets to Hive tables.
4 | ## 2. Create the transactionDateTime variable based on transactionDate and transactionTime.
5 | ## 3. Merge the two tables Untagged_Transactions and Account_Info.
6 | ## 4. Remove duplicates.
7 | 
8 | ## Input : 1. Two data tables: Untagged_Transactions, Account_Info.
9 | ##         2. HDFSWorkDir: the working directory on HDFS.
10 | ##        3. Stage: "Dev" for development, "Prod" for batch scoring, "Web" for web scoring.
11 | ## Output: Hive table: UntaggedTransactionsAccountUnique (Stage = "Dev") or TaggedProd (Stage = "Prod" or "Web").
12 | 
13 | ##########################################################################################################################################
14 | 
15 | merge_account_info <- function(Untagged_Transactions,
16 | Account_Info,
17 | HDFSWorkDir,
18 | Stage)
19 | {
20 | 
21 | # For the Production or Web-Scoring stages, in order to avoid overwriting Hive tables from the Development stage,
22 | # we will add the suffix Prod to the table names. This is encoded in the variable hive_name that will be
23 | ## an empty string for Dev
24 | ## "Prod" for Prod or Web.
25 | if(Stage == "Dev"){
26 | hive_name <- ""
27 | }else{
28 | hive_name <- "Prod"
29 | }
30 | 
31 | # Define the intermediate directory that will hold the intermediate data.
32 | HDFSIntermediateDir <- file.path(HDFSWorkDir,"temp")
33 | 
34 | 
35 | ##############################################################################################################################
36 | ## The block below will convert the data format to Hive in order to increase the efficiency of rx functions.
37 | ##############################################################################################################################
38 | 
39 | print("Converting the input data to Hive on HDFS...")
40 | 
41 | # Create Hive pointers for the two data sets on HDFS.
42 | Untagged_Transactions_hive <- RxHiveData(table = sprintf("UntaggedTransactions%s", hive_name))
43 | Account_Info_hive <- RxHiveData(table = sprintf("AccountInfo%s", hive_name))
44 | 
45 | # Check the input format. Return an error if it is not a path.
46 | if((class(Untagged_Transactions) == "character") & (class(Account_Info) == "character")){
47 | 
48 | # Text pointers to the inputs.
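# (Worked example of the transactionDateTime construction performed in the transforms
# a few lines below: transactionDate = 20130903 with transactionTime = 123 is zero-padded
# by sprintf("%06d", ...) to "000123" and concatenated to "20130903000123"; as.POSIXct
# parses this with format = "%Y%m%d %H%M%S" -- the space in a strptime-style format
# matches any amount of whitespace, including none -- giving "2013-09-03 00:01:23 GMT".)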
49 | Untagged_Transactions_txt <- RxTextData(Untagged_Transactions, firstRowIsColNames = TRUE, fileSystem = RxHdfsFileSystem()) 50 | Account_Info_txt <- RxTextData(Account_Info, firstRowIsColNames = TRUE, fileSystem = RxHdfsFileSystem()) 51 | 52 | # Conversion to Hive tables. 53 | ## At the same time, we create transactionDateTime and recordDateTime. This is done by: 54 | ## converting transactionTime into a 6 digit time. 55 | ## concatenating transactionDate and transactionTime. 56 | ## converting it to a DateTime "%Y%m%d %H%M%S" format. 57 | rxDataStep(inData = Untagged_Transactions_txt, outFile = Untagged_Transactions_hive, overwrite = TRUE, 58 | transforms = list( 59 | transactionDateTime = as.character(as.POSIXct(paste(transactionDate, sprintf("%06d", as.numeric(transactionTime)), sep=""), format = "%Y%m%d %H%M%S", tz = "GMT")) 60 | )) 61 | 62 | rxDataStep(inData = Account_Info_txt, outFile = Account_Info_hive, overwrite = TRUE, 63 | transforms = list( 64 | recordDateTime = as.character(as.POSIXct(paste(transactionDate, sprintf("%06d", as.numeric(transactionTime)), sep=""), format = "%Y%m%d %H%M%S", tz = "GMT")) 65 | )) 66 | 67 | } else { 68 | stop("invalid input format") 69 | } 70 | 71 | ############################################################################################################################################# 72 | ## The block below will merge the two tables Untagged_Transactions and Account_Info. 73 | ############################################################################################################################################ 74 | 75 | print("Merging the 2 tables Untagged_Transactions and Account_Info ...") 76 | 77 | # Inner join of the 2 tables Untagged_Transactions and Account_Info using HIVE command 78 | Drop_Untagged_Transactions_Account_query <- sprintf("hive -e \"DROP TABLE IF EXISTS UntaggedTransactionsAccount%s\"", hive_name) 79 | Create_Untagged_Transactions_Account_query <- sprintf("hive -e \"CREATE TABLE UntaggedTransactionsAccount%s AS 80 | SELECT ut.*, latestRecord, ai.accountOwnerName, ai.accountAddress, ai.accountPostalCode, ai.accountCity, ai.accountState, 81 | ai.accountCountry, ai.accountOpenDate, ai.accountAge, ai.isUserRegistered, 82 | ai.paymentInstrumentAgeInAccount, ai.numPaymentRejects1dPerUser 83 | FROM UntaggedTransactions%s ut 84 | full outer join ( 85 | SELECT t1.accountID, max(t2.recordDateTime) as latestRecord, t1.transactionDateTime 86 | FROM UntaggedTransactions%s t1 join AccountInfo%s t2 87 | ON t2.accountID = t1.accountID 88 | WHERE t2.recordDateTime <= t1.transactionDateTime 89 | GROUP BY t1.accountID, t1.transactionDateTime 90 | ) as lastTrans 91 | ON (ut.accountID = lastTrans.accountID and ut.transactionDateTime = lastTrans.transactionDateTime) 92 | JOIN AccountInfo%s ai 93 | ON ut.accountID = ai.accountID and latestRecord = ai.recordDateTime\"", hive_name, hive_name, hive_name, hive_name, hive_name) 94 | 95 | # drop UntaggedTransactionsAccount table if exists 96 | #cat(Drop_Untagged_Transactions_Account_query) 97 | system(Drop_Untagged_Transactions_Account_query) 98 | 99 | # create table UntaggedTransactionsAccount by merging Untagged_Transactions and Account_Info tables 100 | #cat(Create_Untagged_Transactions_Account_query) 101 | system(Create_Untagged_Transactions_Account_query) 102 | 103 | ############################################################################################################################################ 104 | ## The block below will remove duplicates from UntaggedTransactionsAccount 105 
| ############################################################################################################################################ 106 | 107 | print("Removing duplicates ...") 108 | 109 | Drop_UntaggedTransactionsAccountUnique_query <-sprintf(" 110 | hive -e \" 111 | DROP TABLE IF EXISTS UntaggedTransactionsAccountUnique%s\" 112 | ", hive_name) 113 | 114 | Remove_UntaggedTransactionsAccount_Duplicates_query <- sprintf(" 115 | hive -e \" 116 | CREATE TABLE UntaggedTransactionsAccountUnique%s AS 117 | SELECT t.* FROM 118 | (SELECT *, ROW_NUMBER() OVER (PARTITION BY transactionID, accountID, transactionDateTime, transactionAmount 119 | ORDER BY transactionID ASC) RN 120 | FROM UntaggedTransactionsAccount%s) as t 121 | WHERE t.RN = 1\" 122 | ", hive_name, hive_name) 123 | 124 | system(Drop_UntaggedTransactionsAccountUnique_query) 125 | system(Remove_UntaggedTransactionsAccount_Duplicates_query) 126 | 127 | ############################################################################################################################################# 128 | ## The block below will tag the UntaggedTransactionsAccount by creating a fake label for rxPredict to work correctly. 129 | ## We also exclude transactions with a negative dollar amount or missing ID variables. This preprocessing step is done in the splitting 130 | ## step for the Development stage. 131 | ############################################################################################################################################ 132 | 133 | if(Stage == "Prod" | Stage == "Web"){ 134 | print("Adding a fake label and removing rows with missing ID variables or negative transaction amount...") 135 | 136 | Drop_Tagged_query <- "hive -e \"DROP TABLE IF EXISTS TaggedProd\"" 137 | Tagging_query <- " 138 | hive -e \"create table TaggedProd as 139 | select t.*, 1 as label 140 | from UntaggedTransactionsAccountUniqueProd as t 141 | where accountID IS NOT NULL 142 | and transactionID IS NOT NULL 143 | and transactionDateTime IS NOT NULL 144 | and transactionAmountUSD >= 0\" 145 | " 146 | #cat(Drop_Tagged_query) 147 | system(Drop_Tagged_query) 148 | 149 | #cat(Tagging_query) 150 | system(Tagging_query) 151 | } 152 | 153 | print("Merging account info finished!") 154 | } -------------------------------------------------------------------------------- /RSparkCluster/step2_tagging.R: -------------------------------------------------------------------------------- 1 | ########################################################################################################################################## 2 | ## This R script will do the following: 3 | ## 1. Convert the fraud data set to a hive table. 4 | ## 2. Create the transactionDateTime variable based on transactionDate and transactionTime for fraud table. 5 | ## 3. Remove duplicates for fraud table. 6 | ## 4. Merge the input table with fraud table and create the label at the same time. 7 | 8 | ## Input : 1. Input_Hive_Table: name of the hive table from the merging step with the untagged transactions and account info. 9 | ## 2. Path to csv Fraud files with the raw data Fraud_Transactions. 10 | ## 3. HDFSWorkDir:Working directory on HDFS. 11 | ## Output: Tagged data. 12 | 13 | ########################################################################################################################################## 14 | 15 | 16 | tagging <- function(Input_Hive_Table, 17 | Fraud_Transactions, 18 | HDFSWorkDir) 19 | { 20 | 21 | # Define the intermediate directory holding the input data. 
22 | HDFSIntermediateDir <- file.path(HDFSWorkDir,"temp")
23 | 
24 | 
25 | ##############################################################################################################################
26 | ## The block below will convert the data format to Hive in order to increase the efficiency of rx functions.
27 | ##############################################################################################################################
28 | 
29 | print("Converting the fraud data to Hive on HDFS...")
30 | 
31 | # Create a Hive pointer for the fraud data set on HDFS.
32 | Fraud_Transactions_hive <- RxHiveData(table = "FraudTransactions")
33 | 
34 | # Check the input format. Return an error if it is not a path.
35 | if(class(Fraud_Transactions) == "character"){
36 | 
37 | # Text pointer to the input.
38 | Fraud_Transactions_txt <- RxTextData(Fraud_Transactions, firstRowIsColNames = TRUE, fileSystem = RxHdfsFileSystem())
39 | 
40 | # Conversion to Hive tables.
41 | ## At the same time, we create transactionDateTime. This is done by:
42 | ## converting transactionTime into a 6 digit time.
43 | ## concatenating transactionDate and transactionTime.
44 | ## converting it to a DateTime "%Y%m%d %H%M%S" format.
45 | rxDataStep(inData = Fraud_Transactions_txt,
46 | outFile = Fraud_Transactions_hive,
47 | overwrite = TRUE,
48 | transforms = list(
49 | transactionDateTime = as.character(as.POSIXct(paste(transactionDate, sprintf("%06d", as.numeric(transactionTime)), sep=""), format = "%Y%m%d %H%M%S", tz = "GMT"))
50 | ))
51 | 
52 | } else {
53 | stop("invalid input format")
54 | }
55 | 
56 | 
57 | ############################################################################################################################
58 | ## The block below will remove duplicates from the FraudTransactions table.
59 | ############################################################################################################################
60 | print("Removing duplicates in the Fraud table...")
61 | 
62 | Drop_FraudTransactionsUnique_query <-"
63 | hive -e \"
64 | DROP TABLE IF EXISTS FraudTransactionsUnique\"
65 | "
66 | Remove_FraudTransactions_Duplicates_query <- "
67 | hive -e \"
68 | CREATE TABLE FraudTransactionsUnique AS
69 | SELECT t.* FROM
70 | (SELECT *, ROW_NUMBER() OVER (PARTITION BY transactionID, accountID, transactionDateTime, transactionAmount
71 | ORDER BY transactionID ASC) RN
72 | FROM FraudTransactions) as t
73 | WHERE t.RN = 1\"
74 | "
75 | 
76 | system(Drop_FraudTransactionsUnique_query)
77 | system(Remove_FraudTransactions_Duplicates_query)
78 | 
79 | #############################################################################################################################
80 | ## The block below will tag the Input_Hive_Table at the account level.
81 | ## The tagging is completed by merging the UntaggedTransactionsAccount table with the FraudTransactions table.
82 | ## The tagging logic is: 83 | # if accountID can't be found in fraud dataset => tag as 0, non fraud 84 | # if accountID found in fraud dataset but transactionDateTime is out of the fraud time range => tag as 2, pre-fraud 85 | # if accountID found in fraud dataset and transactionDateTime is within the fraud time range => tag as 1, fraud 86 | ############################################################################################################################################ 87 | print("Tagging on account level ...") 88 | 89 | Drop_Tagged_query <- "hive -e \"DROP TABLE IF EXISTS Tagged\"" 90 | Tagging_query <- paste(" 91 | hive -e \"create table Tagged as 92 | select t.*, 93 | case when sDT is not null and tDT >= sDT and tDT <= eDT then 1 94 | when sDT is not null and tDT < sDT then 2 95 | when sDT is not null and tDT > eDT then 2 96 | when sDT is null then 0 end as label 97 | from 98 | (select t1.*, t1.transactionDateTime as tDT, t2.startDateNTime as sDT, t2.endDateNTime as eDT 99 | from ", Input_Hive_Table," as t1 100 | left join 101 | (select accountID, min(transactionDateTime) as startDateNTime, max(transactionDateTime) as endDateNTime 102 | from FraudTransactionsUnique 103 | group by accountID) as t2 104 | on t1.accountID = t2.accountID) as t\" 105 | ") 106 | #cat(Drop_Tagged_query) 107 | system(Drop_Tagged_query) 108 | 109 | #cat(Tagging_query) 110 | system(Tagging_query) 111 | 112 | print("Tagging finished!") 113 | } -------------------------------------------------------------------------------- /RSparkCluster/step3_splitting.R: -------------------------------------------------------------------------------- 1 | ########################################################################################################################################## 2 | ## This R script will do the following : 3 | ## 1. Hash the tagged data by accountID. 4 | ## 2. Split the tagged data set into a Training and a Testing set. 5 | 6 | 7 | ## Input : Tagged data set. 8 | ## Output: Training and Testing sets. 
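## (Illustration only: splitting is done at the account level so that all of an account's
## transactions land on the same side of the 70/30 boundary, which avoids leaking an
## account's fraud pattern between the training and testing sets. A toy local sketch of
## the same idea, with a simple stand-in for Hive's hash(accountID) % 100:
##   accountIDs <- c("A1", "B2", "C3")
##   hash_code  <- sapply(accountIDs, function(id) sum(utf8ToInt(id)) %% 100)
##   train_ids  <- accountIDs[hash_code <= 70]
##   test_ids   <- accountIDs[hash_code >  70]
## )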
9 | 10 | ########################################################################################################################################## 11 | 12 | ############################################################################################################################## 13 | ## The block below will hash accountID and split data into training and testing 14 | ############################################################################################################################## 15 | 16 | ## Hash accountID 17 | print("Create HashID table by hash accountID...") 18 | 19 | Drop_HashID_query <- " 20 | hive -e \"drop table if exists HashID\" 21 | " 22 | Hashing_query <-" 23 | hive -e \"create table HashID as 24 | select accountID, abs(hash(accountID)%100) as hashCode from Tagged\" 25 | " 26 | system(Drop_HashID_query) 27 | system(Hashing_query) 28 | 29 | ## Split into training and testing 30 | print("Split Tagged data into training and testing based on hashCode...") 31 | 32 | Drop_TaggedTraining_query <- " 33 | hive -e \"drop table if exists TaggedTraining\" 34 | " 35 | Get_TaggedTraining_query <- " 36 | hive -e \"CREATE TABLE TaggedTraining AS 37 | SELECT label, accountID, transactionID, transactionDateTime, isProxyIP, paymentInstrumentType, cardType, paymentBillingAddress, 38 | paymentBillingPostalCode, paymentBillingCountryCode, paymentBillingName, accountAddress, accountPostalCode, 39 | accountCountry, accountOwnerName, shippingAddress, transactionCurrencyCode,localHour, ipState, ipPostCode, 40 | ipCountryCode, browserLanguage, paymentBillingState, accountState, transactionAmountUSD, digitalItemCount, 41 | physicalItemCount, accountAge, paymentInstrumentAgeInAccount, numPaymentRejects1dPerUser, isUserRegistered, 42 | transactionDate, transactionTime 43 | FROM Tagged 44 | WHERE accountID IN (SELECT accountID from HashID WHERE hashCode <= 70) 45 | AND label != 2 46 | AND accountID IS NOT NULL 47 | AND transactionID IS NOT NULL 48 | AND transactionDateTime IS NOT NULL 49 | AND transactionAmountUSD >= 0\" 50 | " 51 | system(Drop_TaggedTraining_query) 52 | system(Get_TaggedTraining_query) 53 | 54 | Drop_TaggedTesting_query <- " 55 | hive -e \"drop table if exists TaggedTesting\" 56 | " 57 | Get_TaggedTesting_query <- " 58 | hive -e \"CREATE TABLE TaggedTesting AS 59 | SELECT label, accountID, transactionID, transactionDateTime, isProxyIP, paymentInstrumentType, cardType, paymentBillingAddress, 60 | paymentBillingPostalCode, paymentBillingCountryCode, paymentBillingName, accountAddress, accountPostalCode, 61 | accountCountry, accountOwnerName, shippingAddress, transactionCurrencyCode,localHour, ipState, ipPostCode, 62 | ipCountryCode, browserLanguage, paymentBillingState, accountState, transactionAmountUSD, digitalItemCount, 63 | physicalItemCount, accountAge, paymentInstrumentAgeInAccount, numPaymentRejects1dPerUser, isUserRegistered, 64 | transactionDate, transactionTime 65 | FROM Tagged 66 | WHERE accountID IN (SELECT accountID from HashID WHERE hashCode > 70) 67 | AND label != 2 68 | AND accountID IS NOT NULL 69 | AND transactionID IS NOT NULL 70 | AND transactionDateTime IS NOT NULL 71 | AND transactionAmountUSD >= 0\" 72 | " 73 | system(Drop_TaggedTesting_query) 74 | system(Get_TaggedTesting_query) 75 | 76 | print("Splitting finished!") 77 | 78 | -------------------------------------------------------------------------------- /RSparkCluster/step4_preprocessing.R: -------------------------------------------------------------------------------- 1 | 
##########################################################################################################################################
2 | ## This R script will perform preprocessing on an input data set.
3 | 
4 | ## Input : 1. HDFSWorkDir: Working directory on HDFS.
5 | ##         2. HiveTable: Name of the Hive table to be preprocessed.
6 | ## Output: Hive table with preprocessed data.
7 | 
8 | ##########################################################################################################################################
9 | 
10 | preprocess <- function(HDFSWorkDir,
11 | HiveTable){
12 | 
13 | # Define the intermediate directory holding the input data.
14 | HDFSIntermediateDir <- file.path(HDFSWorkDir,"temp")
15 | 
16 | 
17 | # Get the variables with missing values.
18 | print("getting variable names with missing values...")
19 | 
20 | # Point to the input hive table, while converting the strings to factors for correct computations with rxSummary.
21 | factorRiskInfo <- mapply(function(names){list(type = "factor")},
22 | c("paymentinstrumenttype",
23 | "cardtype",
24 | "paymentbillingpostalcode",
25 | "paymentbillingcountrycode",
26 | "accountpostalcode",
27 | "accountcountry",
28 | "transactioncurrencycode",
29 | "ipstate",
30 | "ippostcode",
31 | "browserlanguage",
32 | "paymentbillingstate",
33 | "accountstate",
34 | "isuserregistered"
35 | ) ,
36 | SIMPLIFY = FALSE)
37 | 
38 | Input_Table_hive <- RxHiveData(table = HiveTable)
39 | Input_Table_hivefactors <- RxHiveData(table = HiveTable, colInfo = factorRiskInfo)
40 | 
41 | var <- rxGetVarNames(Input_Table_hive)
42 | formula <- as.formula(paste("~", paste(var, collapse = "+")))
43 | summary <- rxSummary(formula, Input_Table_hivefactors, byTerm = TRUE)
44 | variables_NA <- summary$sDataFrame[summary$sDataFrame$MissingObs > 0, 1]
45 | variables_NA <- variables_NA[!variables_NA %in% c("accountid", "transactionid", "transactiondatetime", "transactiondate", "transactiontime")]
46 | 
47 | # If no missing values, we will only preprocess the data. Otherwise, we clean and preprocess.
48 | if(length(variables_NA) == 0){
49 | print("No missing values: only preprocessing will be performed.")
50 | } else{
51 | print("Variables containing missing values are:")
52 | print(variables_NA)
53 | }
54 | 
55 | preprocessing <- function(data) {
56 | data <- data.frame(data, stringsAsFactors = FALSE)
57 | 
58 | # Replace missing values with 0, except for localHour, which gets -99.
59 | if(length(var_with_NA) > 0){
60 | for(i in 1:length(var_with_NA)){
61 | row_na <- which(is.na(data[, var_with_NA[i]]))
62 | if(var_with_NA[i] == c("localhour")){
63 | data[row_na, var_with_NA[i]] <- "-99"
64 | } else{
65 | data[row_na, var_with_NA[i]] <- "0"
66 | }
67 | }
68 | }
69 | 
70 | # Fix some data entries in isUserRegistered, which should be binary.
71 | row_na <- which(data[, c("isuserregistered")] %in% as.character(seq(1, 9)))
72 | data[row_na, c("isuserregistered")] <- "0"
73 | 
74 | # Convert a few variables to numeric, replacing any non-numeric entries with 0.
75 | numeric_to_fix <- c("accountage", "paymentinstrumentageinaccount", "numpaymentrejects1dperuser", "transactionamountusd",
76 | "digitalitemcount", "physicalitemcount")
77 | for(i in 1:length(numeric_to_fix)){
78 | data[, numeric_to_fix[i]] <- as.numeric(data[, numeric_to_fix[i]])
79 | row_na <- which(is.na(as.numeric(data[, numeric_to_fix[i]])))
80 | data[row_na, numeric_to_fix[i]] <- 0
81 | }
82 | return(data)
83 | }
84 | 
85 | # Output pointer.
86 | Output_Table_hive <- RxHiveData(table = paste(HiveTable,"Processed",sep=""))
87 | 
88 | # Apply the preprocessing function to the input table.
89 | print("preprocessing...")
90 | rxDataStep(inData = Input_Table_hive,
91 | outFile = Output_Table_hive,
92 | overwrite = TRUE,
93 | transformFunc = preprocessing,
94 | transformObjects = list(var_with_NA = variables_NA)
95 | )
96 | 
97 | print("Preprocessing finished!")
98 | } -------------------------------------------------------------------------------- /RSparkCluster/step5_create_risk_tables.R: --------------------------------------------------------------------------------
1 | ##########################################################################################################################################
2 | ## This R script will create the risk tables for various character variables.
3 | 
4 | ## Input: 1. LocalWorkDir and HDFSWorkDir: working directories on HDFS and local edge node.
5 | ##        2. HiveTable: name of the Hive table containing the preprocessed training set to be used to create risk tables.
6 | ##        3. smooth1 and smooth2: smoothing parameters used to compute the risk values.
7 | 
8 | ## Output: Risk tables embedded in a list Risk_list, saved on the edge node.
9 | ##########################################################################################################################################
10 | 
11 | create_risk_tables <- function(LocalWorkDir,
12 | HDFSWorkDir,
13 | HiveTable,
14 | smooth1,
15 | smooth2){
16 | 
17 | # Define the intermediate directory holding the input data.
18 | HDFSIntermediateDir <- file.path(HDFSWorkDir,"temp")
19 | 
20 | # Define the directory where Risk tables will be saved in the Development stage.
21 | LocalModelsDir <- file.path(LocalWorkDir, "model")
22 | 
23 | # Variables for which we create Risk Tables.
24 | risk_vars <- c("transactioncurrencycode", "localhour", "ipstate", "ippostcode","ipcountrycode", "browserlanguage",
25 | "accountpostalcode", "accountstate", "accountcountry", "paymentbillingpostalcode", "paymentbillingstate",
26 | "paymentbillingcountrycode")
27 | 
28 | # Point to the input hive table, while converting the strings to factors for correct computations with rxSummary.
29 | factorRiskInfo <- mapply(function(names){list(type = "factor")}, risk_vars, SIMPLIFY = FALSE)
30 | Tagged_Processed_hivefactors <- RxHiveData(table = HiveTable, colInfo = factorRiskInfo)
31 | 
32 | # Count the number of fraud and non-fraud observations for each level of the variables in risk_vars.
33 | ## This is done in the following way:
34 | ## rxExecBy will split the Hive table according to the key argument (here label).
35 | ## The .counts function is then executed on each of the 2 splits, and it returns the counts for each level of the variables.
36 | 
37 | .counts <- function(keys, data, risk_vars){
38 | formula <- as.formula(paste("~", paste(risk_vars, collapse = "+")))
39 | summary <- rxSummary(formula = formula, data = data, byTerm = TRUE)
40 | Summary_Counts <- summary$categorical
41 | return(Summary_Counts)
42 | }
43 | 
44 | counts_by_label_list <- rxExecBy(inData = Tagged_Processed_hivefactors,
45 | keys = c("label"),
46 | func = .counts,
47 | funcParams = list(risk_vars = risk_vars))
48 | 
49 | # Get the 2 lists of count tables, one for each label.
50 | ## We use the $keys value to know which split corresponded to label = 0 and which one to label = 1.
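## (Worked example for the risk values computed further below: with smooth1 = 10 and
## smooth2 = 100, a level observed in 5 fraud and 995 non-fraud transactions gets
## Odds = (5 + 10)/(995 + 5 + 100) = 15/1100 ~ 0.0136 and
## Risk = log(0.0136/(1 - 0.0136)) ~ -4.28. The smoothing shrinks rare levels toward
## the prior instead of letting them take extreme log-odds values.)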
51 | fraud_key <- ifelse(unlist(counts_by_label_list[[1]]$keys) == 1, 1, 2)
52 | non_fraud_key <- ifelse(fraud_key == 1, 2, 1)
53 | Fraud_Counts_list <- counts_by_label_list[[fraud_key]]$result
54 | Non_Fraud_Counts_list <- counts_by_label_list[[non_fraud_key]]$result
55 | 
56 | # Rename the column names according to the label.
57 | names(Fraud_Counts_list) <- lapply(Fraud_Counts_list, FUN = function(x){colnames(x)[1]})
58 | names(Non_Fraud_Counts_list) <- lapply(Non_Fraud_Counts_list, FUN = function(x){colnames(x)[1]})
59 | Fraud_Counts_list <- lapply(Fraud_Counts_list, FUN = function(df){setNames(df, c(colnames(df)[1],"fraudCount"))})
60 | Non_Fraud_Counts_list <- lapply(Non_Fraud_Counts_list, FUN = function(df){setNames(df, c(colnames(df)[1],"nonFraudCount"))})
61 | 
62 | # Merge the results into 1 list of data frames.
63 | Counts_list <- mapply(FUN = function(df1, df2){merge(df1, df2, all = TRUE)}, Fraud_Counts_list, Non_Fraud_Counts_list, SIMPLIFY = FALSE)
64 | 
65 | # Replace NA with 0 (case when a level was not present for one of the labels).
66 | Counts_list <- lapply(Counts_list, FUN = function(df){df[is.na(df)] <- 0; return(df)})
67 | 
68 | # Create the risk tables.
69 | ## Function for 1 data frame in the Counts_list.
70 | compute_risk_values <- function(df){
71 | # Compute the smoothed odds for every level of the variable.
72 | df$Odds <- (df$fraudCount + smooth1)/(df$nonFraudCount + df$fraudCount + smooth2)
73 | # Compute the log of the smoothed odds ratio. This is the risk value.
74 | df$Risk <- log(df$Odds/(1-df$Odds))
75 | return(df[, c(1,5)])
76 | }
77 | 
78 | ## Apply compute_risk_values to every table of the Counts_list.
79 | Risk_list <- lapply(Counts_list, FUN = compute_risk_values)
80 | 
81 | # Save it to the LocalModelsDir for future use.
82 | saveRDS(Risk_list, file.path(LocalModelsDir, "Risk_list.rds"))
83 | 
84 | print("Creating the Risk Tables finished!")
85 | print(sprintf("Risk tables created and saved on the edge node at %s", file.path(LocalModelsDir, "Risk_list.rds")))
86 | 
87 | }
88 | 
89 | 
-------------------------------------------------------------------------------- /RSparkCluster/step7_training.R: --------------------------------------------------------------------------------
1 | ##########################################################################################################################################
2 | ## This R script will train a binary classification model on the input data: a random forest (rxDForest). An alternative gradient boosted trees (GBT) model (rxFastTrees) is included below but commented out.
3 | 
4 | ## Input : 1. LocalWorkDir and HDFSWorkDir: working directories on HDFS and local edge node.
5 | ##         2. Input_Data_Xdf: training data.
6 | 
7 | ## Output: Trained model object, saved on the edge node.
8 | 
9 | ##########################################################################################################################################
10 | 
11 | training <- function(HDFSWorkDir,
12 | LocalWorkDir,
13 | Input_Data_Xdf)
14 | {
15 | 
16 | # Load the MicrosoftML library (used by the optional rxFastTrees alternative below).
17 | library("MicrosoftML")
18 | 
19 | # Define the intermediate directory holding the input data.
20 | HDFSIntermediateDir <- file.path(HDFSWorkDir,"temp")
21 | 
22 | # Define the local directory where the risk tables were saved and where the trained model will be saved.
23 | LocalModelsDir <- file.path(LocalWorkDir, "model")
24 | 
25 | Tagged_Training_Processed_Features_Xdf <- RxXdfData(file.path(HDFSIntermediateDir, Input_Data_Xdf), fileSystem = RxHdfsFileSystem())
26 | 
27 | # Make the model equation.
28 | print("Making equations for training ...")
29 | variables_all <- rxGetVarNames(Tagged_Training_Processed_Features_Xdf)
30 | variables_to_remove <- c("label", "accountid", "transactionid", "transactiondatetime", "transactiondate","transactiontime",
31 | "transactioncurrencycode", "localhour", "ipstate", "ippostcode","ipcountrycode", "browserlanguage",
32 | "accountpostalcode", "accountstate", "accountcountry", "paymentbillingpostalcode", "paymentbillingstate",
33 | "paymentbillingcountrycode","paymentbillingaddress", "paymentbillingname", "accountaddress", "accountownername", "shippingaddress")
34 | 
35 | training_variables <- variables_all[!(variables_all %in% variables_to_remove)]
36 | equation <- paste("label ~ ", paste(training_variables, collapse = "+", sep=""), sep="")
37 | 
38 | # Train the model. The rxFastTrees (GBT) call below is kept for reference; the active code fits a random forest with rxDForest.
39 | print("Training random forest model...")
40 | #rxSetComputeContext('local')
41 | #boosted_fit <- rxFastTrees(formula = as.formula(equation),
42 | # data = Tagged_Training_Processed_Features_Xdf,
43 | # type = c("binary"),
44 | # numTrees = 100,
45 | # learningRate = 0.2,
46 | # splitFraction = 5/24,
47 | # featureFraction = 1,
48 | # minSplit = 10,
49 | # unbalancedSets = TRUE,
50 | # randomSeed = 5)
51 | 
52 | boosted_fit <- rxDForest(formula = as.formula(equation),
53 | data = Tagged_Training_Processed_Features_Xdf,
54 | nTree = 2,
55 | timesToRun = 20,
56 | seed = 5,
57 | method = "class",
58 | scheduleOnce = TRUE,
59 | computeOobError=-1 )
60 | 
61 | # Save the fitted model to the local edge node. (The file keeps the name gbt_model.rds, which step8_prediction.R expects.)
62 | saveRDS(boosted_fit, file = paste(LocalModelsDir, "/gbt_model.rds", sep = ""))
63 | print("Training finished!")
64 | print(paste("Model is saved on the edge node under ", LocalModelsDir, sep=""))
65 | } -------------------------------------------------------------------------------- /RSparkCluster/step8_prediction.R: --------------------------------------------------------------------------------
1 | ##########################################################################################################################################
2 | ## This R script will do batch scoring. (The evaluation of the scores is done separately, in step9_evaluation.R.)
3 | 
4 | ## Input: 1. LocalWorkDir and HDFSWorkDir: working directories on HDFS and local edge node.
5 | ##        2. Input_Data_Xdf: name of the xdf file with the input data to be scored.
6 | ##        3. Stage: "Dev" for development, "Prod" for batch scoring, "Web" for web scoring.
7 | ## Output: Scored data set.
8 | 
9 | ##########################################################################################################################################
10 | 
11 | prediction <- function(HDFSWorkDir,
12 | LocalWorkDir,
13 | Input_Data_Xdf,
14 | Stage)
15 | {
16 | 
17 | # Load the MicrosoftML library for rxPredict on the trained model.
18 | library("MicrosoftML")
19 | 
20 | # Define the intermediate directory holding the input data.
21 | HDFSIntermediateDir <- file.path(HDFSWorkDir,"temp")
22 | 
23 | # Get the trained model.
24 | if(Stage == "Dev" | Stage == "Prod"){
25 | # Define the directory where the model will be loaded from.
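# (In the "Web" branch below, model_objects is not created in this script: it is expected
# to already be in memory, presumably supplied by the web-scoring wrapper that loads the
# published model -- see web_scoring_main.R and in_memory_scoring.R.)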
26 | LocalModelsDir <- file.path(LocalWorkDir, "model") 27 | 28 | # Import the model from LocalModelsDir 29 | boosted_fit <- readRDS(file.path(LocalModelsDir,"gbt_model.rds")) 30 | 31 | }else{ 32 | boosted_fit <- model_objects$boosted_fit 33 | } 34 | 35 | print("Scoring the Random Forest...") 36 | 37 | # Pointer to the Xdf data to be scored 38 | Score_Data_Xdf <- RxXdfData(file.path(HDFSIntermediateDir,Input_Data_Xdf), fileSystem = RxHdfsFileSystem()) 39 | 40 | # Pointer to the Xdf data of output 41 | Predict_Score_Xdf <- RxXdfData(file.path(HDFSIntermediateDir,"PredictScore"), fileSystem = RxHdfsFileSystem()) 42 | 43 | # Make predictions. 44 | rxPredict(modelObject = boosted_fit, 45 | data = Score_Data_Xdf, 46 | type = c("prob"), 47 | outData = Predict_Score_Xdf, 48 | overwrite = TRUE, 49 | extraVarsToWrite = c("accountid", "transactionid", "transactiondate","transactiontime", "transactionamountusd", "label")) 50 | 51 | if(Stage == "Dev"){ 52 | # Save the Predictions data as a Hive table to be used in PowerBI for visualizations (only used in the Dev Stage). 53 | Predict_Score_hive <- RxHiveData(table = "PredictScore") 54 | rxDataStep(inData = Predict_Score_Xdf, outFile = Predict_Score_hive, overwrite = TRUE) 55 | } 56 | 57 | print("Scoring Finished!") 58 | } -------------------------------------------------------------------------------- /Resources/ActionScripts/ConfigureSQL.ps1: -------------------------------------------------------------------------------- 1 | [CmdletBinding()] 2 | param( 3 | [parameter(Mandatory=$true, Position=1)] 4 | [string]$serverName, 5 | 6 | [parameter(Mandatory=$true, Position=2)] 7 | [string]$SolutionName, 8 | 9 | [parameter(Mandatory=$true, Position=3)] 10 | [string]$InstallPy, 11 | 12 | [parameter(Mandatory=$true, Position=4)] 13 | [string]$InstallR 14 | ) 15 | 16 | 17 | 18 | $db = $dbName 19 | 20 | $dataList = ("Account_Info", "Fraud_Transactions", "Untagged_Transactions") 21 | 22 | ########################################################################## 23 | 24 | # Create Database and BaseTables 25 | 26 | ######################################################################### 27 | 28 | #################################################################### 29 | # Check to see If SQL Version is at least SQL 2017 and Not SQL Express 30 | #################################################################### 31 | 32 | 33 | $query = 34 | "select 35 | case 36 | when 37 | cast(left(cast(serverproperty('productversion') as varchar), 4) as numeric(4,2)) >= 14 38 | and CAST(SERVERPROPERTY ('edition') as varchar) Not like 'Express%' 39 | then 'Yes' 40 | else 'No' end as 'isSQL17'" 41 | 42 | $isCompatible = Invoke-Sqlcmd -ServerInstance $ServerName -Database Master -Query $query 43 | $isCompatible = $isCompatible.Item(0) 44 | if ($isCompatible -eq 'Yes' -and $InstallPy -eq 'Yes') { 45 | Write-Host 46 | ("This Version of SQL is Compatible with SQL Py") 47 | 48 | ## Create Py Database 49 | Write-Host 50 | ("Creating SQL Database for Py") 51 | 52 | Write-Host 53 | ("Using $ServerName SQL Instance") 54 | 55 | ## Create PY Server DB 56 | $dbName = $db + "_Py" 57 | $SqlParameters = @("dbName=$dbName") 58 | 59 | $CreateSQLDB = "$ScriptPath\CreateDatabase.sql" 60 | 61 | $CreateSQLObjects = "$ScriptPath\CreateSQLObjectsPy.sql" 62 | Write-Host 63 | ("Calling Script to create the $dbName database") 64 | 65 | invoke-sqlcmd -inputfile $CreateSQLDB -serverinstance $ServerName -database master -Variable $SqlParameters 66 | 67 | 68 | Write-Host 69 | ("SQLServerDB $dbName Created") 70 
|
71 | invoke-sqlcmd "USE $dbName;"
72 | 
73 | Write-Host
74 | ("Calling Script to create the objects in the $dbName database")
75 | 
76 | invoke-sqlcmd -inputfile $CreateSQLObjects -serverinstance $ServerName -database $dbName
77 | 
78 | Write-Host
79 | ("SQLServerObjects Created in $dbName Database")
80 | 
81 | $OdbcName = "obdc" + $dbname
82 | ## Create ODBC Connection for PowerBI to Use
83 | Add-OdbcDsn -Name $OdbcName -DriverName "ODBC Driver 13 for SQL Server" -DsnType 'System' -Platform '64-bit' -SetPropertyValue @("Server=$ServerName", "Trusted_Connection=Yes", "Database=$dbName") -ErrorAction SilentlyContinue -PassThru
84 | 
85 | }
86 | else
87 | {
88 | if ($isCompatible -eq 'No' -and $InstallPy -eq 'Yes')
89 | {
90 | Write-Host
91 | ("This version of SQL is not compatible with Py; the Py code and DBs will not be created")
92 | }
93 | else
94 | {
95 | Write-Host
96 | ("There is no Python version of this solution")
97 | }
98 | }
99 | 
100 | 
101 | 
102 | 
103 | If ($InstallR -eq 'Yes')
104 | {
105 | Write-Host
106 | ("Creating SQL Database for R")
107 | 
108 | 
109 | $dbName = $db + "_R"
110 | 
111 | ## Create RServer DB
112 | $SqlParameters = @("dbName=$dbName")
113 | 
114 | $CreateSQLDB = "$ScriptPath\CreateDatabase.sql"
115 | 
116 | $CreateSQLObjects = "$ScriptPath\CreateSQLObjectsR.sql"
117 | Write-Host
118 | ("Calling Script to create the $dbName database")
119 | invoke-sqlcmd -inputfile $CreateSQLDB -serverinstance $ServerName -database master -Variable $SqlParameters
120 | 
121 | 
122 | Write-Host
123 | ("SQLServerDB $dbName Created")
124 | invoke-sqlcmd "USE $dbName;"
125 | 
126 | Write-Host
127 | ("Calling Script to create the objects in the $dbName database")
128 | invoke-sqlcmd -inputfile $CreateSQLObjects -serverinstance $ServerName -database $dbName
129 | 
130 | 
131 | Write-Host
132 | ("SQLServerObjects Created in $dbName Database")
133 | 
134 | 
135 | ### Configure Database for R
136 | Write-Host
137 | ("Configuring $SolutionName Solution for R")
138 | 
139 | $dbName = $db + "_R"
140 | 
141 | ## Create ODBC Connection for PowerBI to Use
142 | $OdbcName = "obdc" + $dbname
143 | ## Create ODBC Connection for PowerBI to Use
144 | Add-OdbcDsn -Name $OdbcName -DriverName "ODBC Driver 13 for SQL Server" -DsnType 'System' -Platform '64-bit' -SetPropertyValue @("Server=$ServerName", "Trusted_Connection=Yes", "Database=$dbName") -ErrorAction SilentlyContinue -PassThru
145 | 
146 | 
147 | ##########################################################################
148 | # Deployment Pipeline
149 | ##########################################################################
150 | 
151 | $RStart = Get-Date
152 | try
153 | {
154 | Write-Host
155 | ("Import CSV File(s). This Should take about 30 Seconds Per File")
156 | 
157 | # upload csv files into SQL tables
158 | foreach ($dataFile in $dataList)
159 | {
160 | $destination = $SolutionData + $dataFile + ".csv"
161 | $tableName = $DBName + ".dbo." + $dataFile
162 | $tableSchema = $dataPath + "\" + $dataFile + ".xml"
163 | $dataSet = Import-Csv $destination
164 | Write-Host
165 | ("Loading $dataFile.csv into SQL Table")
166 | Write-SqlTableData -InputData $dataSet -DatabaseName $dbName -Force -Passthru -SchemaName dbo -ServerInstance $ServerName -TableName $dataFile
167 | Write-Host
168 | ("$datafile table loaded from CSV File(s).")
169 | }
170 | }
171 | catch
172 | {
173 | Write-Host -ForegroundColor DarkYellow "Exception in populating database tables:"
174 | Write-Host -ForegroundColor Red $Error[0].Exception
175 | throw
176 | }
177 | Write-Host
178 | ("Finished loading .csv File(s).")
179 | 
180 | Write-Host
181 | ("Training Model and Scoring Data...")
182 | 
183 | $query = "EXEC Initial_Run_Once_R"
184 | SqlServer\Invoke-Sqlcmd -ServerInstance LocalHost -Database $dbName -Query $query -ConnectionTimeout 0 -QueryTimeout 0
185 | 
186 | $Rend = Get-Date
187 | 
188 | $Duration = New-TimeSpan -Start $RStart -End $Rend
189 | Write-Host
190 | ("R Server Configured in $Duration")
191 | }
192 | ELSE
193 | {
194 | Write-Host
195 | ("There is no R version of this solution, so R will not be installed")
196 | }
197 | 
198 | 
199 | ### Configure Database for Py
200 | if ($isCompatible -eq 'Yes' -and $InstallPy -eq 'Yes')
201 | 
202 | {
203 | 
204 | $PyStart = get-date
205 | Write-Host
206 | ("Configuring $SolutionName Solution for Py")
207 | $dbname = $db + "_Py"
208 | 
209 | ##########################################################################
210 | # Deployment Pipeline Py
211 | ##########################################################################
212 | 
213 | 
214 | try
215 | {
216 | Write-Host ("Import CSV File(s). This Should take about 30 Seconds Per File")
217 | 
218 | # upload csv files into SQL tables
219 | foreach ($dataFile in $dataList)
220 | {
221 | $destination = $SolutionData + $dataFile + ".csv"
222 | $tableName = $DBName + ".dbo." + $dataFile
223 | $tableSchema = $dataPath + "\" + $dataFile + ".xml"
224 | $dataSet = Import-Csv $destination
225 | Write-Host
226 | ("Loading $dataFile.csv into SQL Table")
227 | Write-SqlTableData -InputData $dataSet -DatabaseName $dbName -Force -Passthru -SchemaName dbo -ServerInstance $ServerName -TableName $dataFile
228 | Write-Host
229 | ("$datafile table loaded from CSV File(s).")
230 | }
231 | }
232 | catch
233 | {
234 | Write-Host -ForegroundColor DarkYellow "Exception in populating database tables:"
235 | Write-Host -ForegroundColor Red $Error[0].Exception
236 | throw
237 | }
238 | Write-Host
239 | ("Finished loading .csv File(s).")
240 | 
241 | 
242 | Write-Host
243 | ("Training Model and Scoring Data...")
244 | $query = "EXEC Inital_Run_Once_Py"
245 | SqlServer\Invoke-Sqlcmd -ServerInstance LocalHost -Database $dbName -Query $query -ConnectionTimeout 0 -QueryTimeout 0
246 | 
247 | $Pyend = Get-Date
248 | 
249 | $Duration = New-TimeSpan -Start $PyStart -End $Pyend
250 | Write-Host
251 | ("Py Server Configured in $Duration")
252 | }
253 | 
254 | 
-------------------------------------------------------------------------------- /Resources/ActionScripts/CreateDatabase.sql: --------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | 
5 | BEGIN
6 | DECLARE
7 | @DbName VARCHAR(400) = N'$(dbName)',
8 | @ServerName varchar(100) = (SELECT CAST(SERVERPROPERTY('ServerName') as Varchar)),
9 | @InstanceName varchar(100) = (SELECT CAST(SERVERPROPERTY('InstanceName') as Varchar)),
10 | @UI varchar(100),
11 | @Qry VARCHAR(MAX)
12 | 
13 | 
14 | ----Create Needed SQLRUsergroup Name ,
15 | ----if Default Instance UI = {ServerName}\SQLRUserGroup
16 | ----if Named Instance {ServerName}\SQLRUserGroup{InstanceName}
17 | 
18 | If @InstanceName is null
19 | BEGIN
20 | SET @UI = @ServerName + '\SQLRUserGroup'
21 | END
22 | 
23 | If @InstanceName is Not null
24 | BEGIN
25 | SET @UI = @ServerName + '\SQLRUserGroup' + @InstanceName
26 | END
27 | 
28 | 
29 | 
30 | SET @Qry =
31 | ('
32 | EXEC msdb.dbo.sp_delete_database_backuphistory @database_name = N''<dbname>''
33 | USE [master]
34 | ALTER DATABASE <dbname> SET SINGLE_USER WITH ROLLBACK IMMEDIATE
35 | USE [master]
36 | DROP DATABASE <dbname>
37 | ')
38 | 
39 | 
40 | --If DB Already Exists , Drop it and recreate it
41 | IF EXISTS(select * from sys.databases where name = @DbName)
42 | 
43 | BEGIN
44 | SET @Qry = (REPLACE(@Qry,'<dbname>',@DbName) )
45 | EXEC (@Qry)
46 | END
47 | 
48 | 
49 | DECLARE @Query VARCHAR(MAX)=''
50 | ---Find Default Database File Path and Create DB there
51 | DECLARE @DbFilePath VARCHAR(400) = (SELECT top 1 LEFT(physical_name, (LEN(physical_name) - CHARINDEX('\',REVERSE(physical_name)))) + '\' as BasePath FROM sys.master_files WHERE type_desc = 'ROWS')
52 | 
53 | --Find Default Log File Path and Create Log there
54 | DECLARE @LogFilePath VARCHAR(400) = (SELECT top 1 LEFT(physical_name, (LEN(physical_name) - CHARINDEX('\',REVERSE(physical_name)))) + '\' as BasePath FROM sys.master_files WHERE type_desc = 'LOG')
55 | 
56 | 
57 | IF NOT EXISTS(select * from sys.databases where name = @DbName)
58 | BEGIN
59 | SET @Query = @Query + 'CREATE DATABASE '+@DbName +' ON PRIMARY '
60 | SET @Query = @Query + '( NAME = '''+@DbName +''', FILENAME = '''+@DbFilePath+@DbName +'.mdf'' , SIZE = 73728KB , MAXSIZE = UNLIMITED, FILEGROWTH = 1024KB ) '
61 | SET @Query = @Query + ' LOG ON '
62 | SET @Query = @Query + '( NAME = '''+@DbName +'_log'', FILENAME = '''+@LogFilePath+@DbName +'_log.ldf'' , SIZE = 1024KB , MAXSIZE = 2048GB , FILEGROWTH = 1024KB)'
63 | exec(@query)
64 | END
65 | 
66 | DECLARE @Alter VARCHAR(MAX)
67 | SET @Alter =
68 | (
69 | 'ALTER DATABASE <dbname> SET COMPATIBILITY_LEVEL = 130
70 | IF (1 = FULLTEXTSERVICEPROPERTY(''IsFullTextInstalled''))
71 | begin
72 | EXEC <dbname>.[dbo].[sp_fulltext_database] @action = ''enable''
73 | end
74 | ALTER DATABASE <dbname> SET ANSI_NULL_DEFAULT OFF
75 | ALTER DATABASE <dbname> SET ANSI_NULLS OFF
76 | ALTER DATABASE <dbname> SET ANSI_PADDING OFF
77 | ALTER DATABASE <dbname> SET ANSI_WARNINGS OFF
78 | ALTER DATABASE <dbname> SET ARITHABORT OFF
79 | ALTER DATABASE <dbname> SET AUTO_CLOSE OFF
80 | ALTER DATABASE <dbname> SET AUTO_SHRINK OFF
81 | ALTER DATABASE <dbname> SET AUTO_UPDATE_STATISTICS ON
82 | ALTER DATABASE <dbname> SET CURSOR_CLOSE_ON_COMMIT OFF
83 | ALTER DATABASE <dbname> SET CURSOR_DEFAULT GLOBAL
84 | ALTER DATABASE <dbname> SET CONCAT_NULL_YIELDS_NULL OFF
85 | ALTER DATABASE <dbname> SET NUMERIC_ROUNDABORT OFF
86 | ALTER DATABASE <dbname> SET QUOTED_IDENTIFIER OFF
87 | ALTER DATABASE <dbname> SET RECURSIVE_TRIGGERS OFF
88 | ALTER DATABASE <dbname> SET ENABLE_BROKER
89 | ALTER DATABASE <dbname> SET AUTO_UPDATE_STATISTICS_ASYNC OFF
90 | ALTER DATABASE <dbname> SET DATE_CORRELATION_OPTIMIZATION OFF
91 | ALTER DATABASE <dbname> SET TRUSTWORTHY OFF
92 | ALTER DATABASE <dbname> SET ALLOW_SNAPSHOT_ISOLATION OFF
93 | ALTER DATABASE <dbname> SET PARAMETERIZATION SIMPLE
94 | ALTER DATABASE <dbname> SET READ_COMMITTED_SNAPSHOT OFF
95 | ALTER DATABASE <dbname> SET HONOR_BROKER_PRIORITY OFF
96 | ALTER DATABASE <dbname> SET RECOVERY FULL
97 | ALTER DATABASE <dbname> SET MULTI_USER
98 | ALTER DATABASE <dbname> SET PAGE_VERIFY CHECKSUM
99 | ALTER DATABASE <dbname> SET DB_CHAINING OFF
100 | ALTER DATABASE <dbname> SET FILESTREAM( NON_TRANSACTED_ACCESS = OFF )
101 | ALTER DATABASE <dbname> SET TARGET_RECOVERY_TIME = 60 SECONDS
102 | ALTER DATABASE <dbname> SET DELAYED_DURABILITY = DISABLED
103 | EXEC sys.sp_db_vardecimal_storage_format N''<dbname>'', N''ON''
104 | ALTER DATABASE <dbname> SET QUERY_STORE = OFF
105 | ALTER DATABASE <dbname> SET READ_WRITE'
106 | )
107 | SET @Alter = (REPLACE(@Alter,'<dbname>',@DbName))
108 | EXEC (@Alter)
109 | 
110 | ----CREATE USER SQLRUserGroup on SQL Server
111 | 
112 | SET @Qry =
113 | '
114 | IF NOT EXISTS (SELECT name FROM master.sys.server_principals where name = ''<user>'')
115 | BEGIN CREATE LOGIN [<user>] FROM WINDOWS WITH DEFAULT_DATABASE=[master], DEFAULT_LANGUAGE=[us_english] END
116 | '
117 | SET @Qry = REPLACE(@qry,'<user>', @ui)
118 | 
119 | EXEC (@Qry)
120 | --SELECT @Qry
121 | 
122 | 
123 | ----Give SQLRUserGroup Rights To Database(s)
124 | SET @Qry =
125 | '
126 | USE [<dbname>]
127 | CREATE USER [<user>] FOR LOGIN [<user>]
128 | 
129 | ALTER USER [<user>] WITH DEFAULT_SCHEMA=NULL
130 | 
131 | ALTER AUTHORIZATION ON SCHEMA::[db_datareader] TO [<user>]
132 | 
133 | ALTER AUTHORIZATION ON SCHEMA::[db_datawriter] TO [<user>]
134 | 
135 | ALTER AUTHORIZATION ON SCHEMA::[db_ddladmin] TO [<user>]
136 | 
137 | ALTER ROLE [db_datareader] ADD MEMBER [<user>]
138 | 
139 | ALTER ROLE [db_datawriter] ADD MEMBER [<user>]
140 | 
141 | ALTER ROLE [db_ddladmin] ADD MEMBER [<user>]
142 | '
143 | SET @Qry = REPLACE(REPLACE(@qry,'<user>', @ui),'<dbname>',@DbName)
144 | 
145 | EXEC (@Qry)
146 | --SELECT @Qry
147 | 
148 | END
-------------------------------------------------------------------------------- /Resources/ActionScripts/CreateSQLObjectsR.sql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/r-server-fraud-detection/4ff1bc6740a9384d80dbf3cc2c97e374a47248af/Resources/ActionScripts/CreateSQLObjectsR.sql -------------------------------------------------------------------------------- /Resources/ActionScripts/createShortcuts.ps1: --------------------------------------------------------------------------------
1 | <#
2 | 
3 | .SYNOPSIS
4 | Script to create the help shortcut and the solution folder shortcut.
5 | 
6 | .PARAMETER helpfile
7 | path to the help url file.
8 | 
9 | .PARAMETER solutionPath
10 | path to the solution folder with data and source.
11 | 
12 | #>
13 | [CmdletBinding()]
14 | param(
15 | [parameter(Mandatory=$true, Position=1, ParameterSetName = "LCR")]
16 | [ValidateNotNullOrEmpty()]
17 | [string]$helpfile,
18 | 
19 | [parameter(Mandatory=$true, Position=2, ParameterSetName = "LCR")]
20 | [ValidateNotNullOrEmpty()]
21 | [string]$solutionPath
22 | )
23 | 
24 | # find the desktop
25 | $desktop = [Environment]::GetFolderPath("Desktop")
26 | 
27 | $desktop = $desktop + '\'
28 | 
29 | 
30 | #create the help link in the startup programs folder
31 | 
32 | $startmenu = [Environment]::GetFolderPath("StartMenu")
33 | $startupfolder = $startmenu + '\Programs\Startup\'
34 | # We create this since the user startup folder is only created after first login
35 | # An alternative is to add it to the all-users startup folder
36 | mkdir $startupfolder
37 | #copy
38 | $down = $helpfile
39 | Write-Host $down
40 | Write-Host $startmenu
41 | ls $startmenu
42 | Write-Host $startupfolder
43 | ls $startupfolder
44 | cp -Verbose $down $startupfolder
45 | cp -Verbose $down $desktop
46 | 
47 | #create shortcut to solution folder on desktop
48 | $WsShell = New-Object -ComObject WScript.Shell
49 | $shortcut = $WsShell.CreateShortcut($desktop + "Fraud.lnk")
50 | $shortcut.TargetPath = $solutionPath
51 | $shortcut.Save()
--------------------------------------------------------------------------------
/Resources/ActionScripts/frauddetection_Help.url:
--------------------------------------------------------------------------------
1 | [InternetShortcut]
2 | URL=https://microsoft.github.io/r-server-fraud-detection/Typical.html
3 | IDList=
4 | HotKey=0
5 | [{000214A0-0000-0000-C000-000000000046}]
6 | Prop3=19,11
7 | 
--------------------------------------------------------------------------------
/Resources/ActionScripts/hdisetup.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | # This script is used to set up an HDInsight cluster deployed from the Cortana Analytics Gallery
4 | # WARNING: This script is only meant to be run from the solution template deployment process.
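# Note on the positional arguments (inferred from their usage below, not documented
# in this repo): $1 appears to be the edge-node user whose home directory receives
# the R scripts, and $2 the admin password that is both substituted into those
# scripts and passed to the one-box operationalization install.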
5 | 
6 | # put R code in the user's home directory
7 | git clone --branch master --single-branch https://github.com/Microsoft/r-server-fraud-detection.git fraud
8 | cp fraud/RSparkCluster/* /home/$1
9 | chmod 777 /home/$1/*.R
10 | rm -rf fraud
11 | sed -i "s/XXYOURPW/$2/g" /home/$1/*.R
12 | 
13 | # Configure edge node as one-box setup for R Server Operationalization
14 | /usr/local/bin/dotnet /usr/lib64/microsoft-r/rserver/o16n/9.1.0/Microsoft.RServer.Utils.AdminUtil/Microsoft.RServer.Utils.AdminUtil.dll -silentoneboxinstall "$2"
15 | 
16 | # turn off telemetry
17 | sed -i 's/options(mds.telemetry=1)/options(mds.telemetry=0)/g' /usr/lib64/microsoft-r/3.3/lib64/R/etc/Rprofile.site
18 | sed -i 's/options(mds.logging=1)/options(mds.logging=0)/g' /usr/lib64/microsoft-r/3.3/lib64/R/etc/Rprofile.site
19 | 
--------------------------------------------------------------------------------
/Resources/exampleuser.sql:
--------------------------------------------------------------------------------
1 | --
2 | -- remove old rdemo user and login from master
3 | --
4 | USE [master]
5 | GO
6 | IF EXISTS (SELECT name FROM sys.database_principals WHERE name = 'rdemo')
7 | BEGIN
8 | PRINT 'Deleting old rdemo user from master'
9 | DROP USER [rdemo]
10 | END
11 | GO
12 | IF EXISTS (SELECT name FROM master.sys.server_principals WHERE name = 'rdemo')
13 | BEGIN
14 | PRINT 'Deleting old rdemo login from master'
15 | DROP LOGIN [rdemo]
16 | END
17 | GO
18 | --
19 | -- create new rdemo login in master
20 | --
21 | USE [master]
22 | GO
23 | PRINT 'Creating rdemo login in master'
24 | CREATE LOGIN [rdemo] WITH PASSWORD=N'D@tascience', CHECK_EXPIRATION=OFF, CHECK_POLICY=OFF;
25 | CREATE USER [rdemo] FOR LOGIN [rdemo]
26 | --ALTER ROLE [db_rrerole] ADD MEMBER [rdemo]
27 | ALTER ROLE [db_owner] ADD MEMBER [rdemo]
28 | GO
29 | 
30 | exec sp_addrolemember 'db_owner', 'rdemo'
31 | exec sp_addrolemember 'db_ddladmin', 'rdemo'
32 | exec sp_addrolemember 'db_accessadmin', 'rdemo'
33 | exec sp_addrolemember 'db_datareader', 'rdemo'
34 | exec sp_addrolemember 'db_datawriter', 'rdemo'
35 | exec sp_addsrvrolemember @loginame= 'rdemo', @rolename = 'sysadmin'
36 | GO
37 | 
--------------------------------------------------------------------------------
/Resources/images/fraud.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/r-server-fraud-detection/4ff1bc6740a9384d80dbf3cc2c97e374a47248af/Resources/images/fraud.jpg
--------------------------------------------------------------------------------
/Resources/readme.md:
--------------------------------------------------------------------------------
1 | Scripts in the Resources folder should only be run once, through the template deployment process. They are not meant to be run by users, as they assume the database and users do not already exist.
2 | 
3 | # Contributing
4 | 
5 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
6 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 
40 | 
41 | 
42 | 
--------------------------------------------------------------------------------
/SQLR/CreateRiskTable.sql:
--------------------------------------------------------------------------------
1 | /*
2 | This script will create a stored procedure that builds the risk table for a given input variable
3 | 
4 | input parameters:
5 | @name = the name of the variable to generate the risk table for
6 | @table_name = the name of the output risk table
7 | */
8 | 
9 | set ansi_nulls on
10 | go
11 | 
12 | set quoted_identifier on
13 | go
14 | 
15 | DROP PROCEDURE IF EXISTS CreateRiskTable
16 | GO
17 | 
18 | create procedure CreateRiskTable
19 | @name varchar(max),
20 | @table_name varchar(max)
21 | as
22 | begin
23 | declare @filltablesql nvarchar(max)
24 | declare @droptablesql nvarchar(max)
25 | declare @removenullconstrain nvarchar(max)
26 | declare @addprimarykey nvarchar(max)
27 | 
28 | /* drop corresponding table if it already exists */
29 | set @droptablesql = 'DROP TABLE IF EXISTS ' + @table_name
30 | exec sp_executesql @droptablesql
31 | 
32 | /* create risk table */
33 | set @filltablesql = 'select ' + @name + ' , log(odds/(1-odds)) as risk
34 | into dbo.' + @table_name +
35 | ' from (select distinct ' + @name + ' ,cast((sum(label)+10) as float)/cast((sum(label)+sum(1-label)+100) as float) as odds
36 | from Tagged_Training_Processed group by ' + @name + ' ) temp'
37 | 
38 | /* example: when @name=localHour, @table_name=Risk_LocalHour, @filltablesql is the following:
39 | select localHour , log(odds/(1-odds)) as risk
40 | into Risk_LocalHour from (select distinct localHour ,cast((sum(label)+10) as float)/cast((sum(label)+sum(1-label)+100) as float) as odds
41 | from Tagged_Training_Processed group by localHour ) temp
42 | */
43 | 
44 | exec sp_executesql @filltablesql
45 | end
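A quick worked example of the smoothed log-odds computed above (the counts are illustrative, not taken from the data set):

-- a localHour level with 40 fraud rows (label = 1) and 960 non-fraud rows gives
--   odds = (40 + 10) / (40 + 960 + 100) = 50 / 1100 ≈ 0.0455
--   risk = log(0.0455 / 0.9545) ≈ -3.04        (T-SQL log() is the natural log)
-- The +10 and +100 terms act as a smoothing prior: a level with no observations
-- gets odds = 10/100 = 0.1, i.e. risk = log(0.1/0.9) ≈ -2.20, instead of diverging.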
--------------------------------------------------------------------------------
/SQLR/ParseString.sql:
--------------------------------------------------------------------------------
1 | /*
2 | This script creates the stored procedure to:
3 | 1. ingest a string and store it into a temporary table
4 | 2. parse the string and output the parsed string to a sql table
5 | */
6 | 
7 | set ansi_nulls on
8 | go
9 | 
10 | set quoted_identifier on
11 | go
12 | 
13 | DROP PROCEDURE IF EXISTS ParseStr
14 | GO
15 | 
16 | create procedure ParseStr @inputstring VARCHAR(MAX)
17 | as
18 | begin
19 | 
20 | /* Reformat the long string into XML (e.g. 'a,b' becomes <M>a</M><M>b</M>) whose elements can be retrieved by location index */
21 | declare @parsequery nvarchar(max)
22 | set @parsequery = '
23 | DECLARE @tmp table ( ID int Identity(1,1) ,[Name] nvarchar(max))
24 | INSERT into @tmp SELECT ''' + @inputstring + '''
25 | drop table if exists Parsed_String
26 | ;WITH tmp AS
27 | (
28 | SELECT
29 | CAST(''<M>'' + REPLACE([Name], '','' , ''</M><M>'') + ''</M>'' AS XML)
30 | AS [NameParsed]
31 | FROM @tmp
32 | )
33 | SELECT
34 | case when [NameParsed].value(''/M[1]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[1]'', ''varchar (100)'') end As [transactionID],
35 | case when [NameParsed].value(''/M[2]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[2]'', ''varchar (100)'') end As [accountID],
36 | case when [NameParsed].value(''/M[3]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[3]'', ''varchar (100)'') end As [transactionAmountUSD],
37 | case when [NameParsed].value(''/M[4]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[4]'', ''varchar (100)'') end As transactionAmount,
38 | case when [NameParsed].value(''/M[5]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[5]'', ''varchar (100)'') end As [transactionCurrencyCode],
39 | case when [NameParsed].value(''/M[6]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[6]'', ''varchar (100)'') end As [transactionCurrencyConversionRate],
40 | case when [NameParsed].value(''/M[7]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[7]'', ''varchar (100)'') end As [transactionDate],
41 | case when [NameParsed].value(''/M[8]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[8]'', ''varchar (100)'') end As [transactionTime],
42 | case when [NameParsed].value(''/M[9]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[9]'', ''varchar (100)'') end As [localHour],
43 | case when [NameParsed].value(''/M[10]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[10]'', ''varchar (100)'') end As [transactionScenario],
44 | case when [NameParsed].value(''/M[11]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[11]'', ''varchar (100)'') end As [transactionType],
45 | case when [NameParsed].value(''/M[12]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[12]'', ''varchar (100)'') end As [transactionMethod],
46 | case when [NameParsed].value(''/M[13]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[13]'', ''varchar (100)'') end As [transactionDeviceType],
47 | case when [NameParsed].value(''/M[14]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[14]'', ''varchar (100)'') end As [transactionDeviceId],
48 | case when [NameParsed].value(''/M[15]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[15]'', ''varchar (100)'') end As [transactionIPaddress],
49 | case when [NameParsed].value(''/M[16]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[16]'', ''varchar (100)'') end As [ipState],
50 | case when [NameParsed].value(''/M[17]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[17]'', ''varchar
(100)'') end As [ipPostcode], 51 | case when [NameParsed].value(''/M[18]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[18]'', ''varchar (100)'') end As [ipCountryCode], 52 | case when [NameParsed].value(''/M[19]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[19]'', ''varchar (100)'') end As [isProxyIP], 53 | case when [NameParsed].value(''/M[20]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[20]'', ''varchar (100)'') end As [browserType], 54 | case when [NameParsed].value(''/M[21]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[21]'', ''varchar (100)'') end As [browserLanguage], 55 | case when [NameParsed].value(''/M[22]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[22]'', ''varchar (100)'') end As [paymentInstrumentType], 56 | case when [NameParsed].value(''/M[23]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[23]'', ''varchar (100)'') end As [cardType], 57 | case when [NameParsed].value(''/M[24]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[24]'', ''varchar (100)'') end As [cardNumberInputMethod], 58 | case when [NameParsed].value(''/M[25]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[25]'', ''varchar (100)'') end As [paymentInstrumentID], 59 | case when [NameParsed].value(''/M[26]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[26]'', ''varchar (100)'') end As [paymentBillingAddress], 60 | case when [NameParsed].value(''/M[27]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[27]'', ''varchar (100)'') end As [paymentBillingPostalCode], 61 | case when [NameParsed].value(''/M[28]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[28]'', ''varchar (100)'') end As [paymentBillingState], 62 | case when [NameParsed].value(''/M[29]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[29]'', ''varchar (100)'') end As [paymentBillingCountryCode], 63 | case when [NameParsed].value(''/M[30]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[30]'', ''varchar (100)'') end As [paymentBillingName], 64 | case when [NameParsed].value(''/M[31]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[31]'', ''varchar (100)'') end As [shippingAddress], 65 | case when [NameParsed].value(''/M[32]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[32]'', ''varchar (100)'') end As [shippingPostalCode], 66 | case when [NameParsed].value(''/M[33]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[33]'', ''varchar (100)'') end As [shippingCity], 67 | case when [NameParsed].value(''/M[34]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[34]'', ''varchar (100)'') end As [shippingState], 68 | case when [NameParsed].value(''/M[35]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[35]'', ''varchar (100)'') end As [shippingCountry], 69 | case when [NameParsed].value(''/M[36]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[36]'', ''varchar (100)'') end As [cvvVerifyResult], 70 | case when [NameParsed].value(''/M[37]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[37]'', ''varchar (100)'') end As [responseCode], 71 | case when [NameParsed].value(''/M[38]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[38]'', ''varchar (100)'') end As [digitalItemCount], 72 | case when 
[NameParsed].value(''/M[39]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[39]'', ''varchar (100)'') end As [physicalItemCount],
73 | case when [NameParsed].value(''/M[40]'', ''varchar (100)'')=''NULL'' then NULL else [NameParsed].value(''/M[40]'', ''varchar (100)'') end As [purchaseProductType]
74 | into Parsed_String
75 | FROM tmp'
76 | exec sp_executesql @parsequery
77 | end
--------------------------------------------------------------------------------
/SQLR/ScoreOneTrans.sql:
--------------------------------------------------------------------------------
1 | /*
2 | This script creates the stored procedure to score one transaction by invoking the following stored procedures:
3 | 1. ParseStr: parse the input string and save it to a sql table
4 | 2. PredictR: preprocess, feature engineer, and score the parsed transaction
5 | */
6 | 
7 | set ansi_nulls on
8 | go
9 | 
10 | set quoted_identifier on
11 | go
12 | 
13 | DROP PROCEDURE IF EXISTS ScoreOneTrans
14 | GO
15 | 
16 | create procedure ScoreOneTrans @inputstring VARCHAR(MAX)
17 | as
18 | begin
19 | 
20 | /* invoke ParseStr */
21 | declare @invokeParseStr nvarchar(max)
22 | set @invokeParseStr ='
23 | exec ParseStr ''' + @inputstring + ''''
24 | exec sp_executesql @invokeParseStr
25 | 
26 | /* invoke PredictR */
27 | declare @invokePredictR nvarchar(max)
28 | set @invokePredictR ='
29 | exec PredictR ''Parsed_String'', ''Predict_Score_Single_Transaction'',''1''
30 | '
31 | exec sp_executesql @invokePredictR
32 | SELECT [Probability.1] FROM [Fraud].[dbo].[Predict_Score_Single_Transaction]
33 | 
34 | end
--------------------------------------------------------------------------------
/SQLR/SortAcctTable.sql:
--------------------------------------------------------------------------------
1 | /*
2 | This script will create stored procedure to
3 | 1. create a recordDateTime column for the Account_Info table
4 | 2. sort the table by accountID and, within each account, by recordDateTime in descending order
5 | */
6 | 
7 | set ansi_nulls on
8 | go
9 | 
10 | set quoted_identifier on
11 | go
12 | 
13 | DROP PROCEDURE IF EXISTS sortAcctTable
14 | GO
15 | 
16 | create procedure sortAcctTable @table nvarchar(max)
17 | as
18 | begin
19 | 
20 | declare @dropTable nvarchar(max)
21 | set @dropTable = '
22 | drop table if exists ' + @table + '_Sort'
23 | exec sp_executesql @dropTable
24 | 
25 | declare @sortAcctTableQuery nvarchar(max)
26 | set @sortAcctTableQuery = '
27 | select *,
28 | convert(datetime,stuff(stuff(stuff(concat(transactionDate,dbo.FormatTime(transactionTime)), 9, 0, '' ''), 12, 0, '':''), 15, 0, '':'')) as recordDateTime
29 | into ' + @table + '_Sort from ' + @table + '
30 | order by accountID, recordDateTime desc
31 | '
32 | exec sp_executesql @sortAcctTableQuery
33 | end
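For reference, a worked example of the date/time reconstruction used above (the values are illustrative):

-- transactionDate = '20130409', transactionTime = '93456'
--   dbo.FormatTime('93456')                  -> '093456'  (zero-padded to 6 digits)
--   concat('20130409', '093456')             -> '20130409093456'
--   stuff(..., 9, 0, ' ')                    -> '20130409 093456'
--   stuff(..., 12, 0, ':')                   -> '20130409 09:3456'
--   stuff(..., 15, 0, ':')                   -> '20130409 09:34:56'
--   convert(datetime, '20130409 09:34:56')   -> 2013-04-09 09:34:56.000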
--------------------------------------------------------------------------------
/SQLR/Step0_CreateTables.sql:
--------------------------------------------------------------------------------
1 | /*
2 | The script will create the following tables:
3 | 1. table for untagged transactions
4 | 2. table for account information
5 | 3. table for fraud transactions
6 | 4. table storing historical transactions which will be used for calculating aggregates
7 | */
8 | 
9 | set ansi_nulls on
10 | go
11 | 
12 | set quoted_identifier on
13 | go
14 | 
15 | drop table if exists Untagged_Transactions
16 | create table Untagged_Transactions (
17 | transactionID varchar(255),
18 | accountID varchar(255),
19 | transactionAmountUSD varchar(255),
20 | transactionAmount varchar(255),
21 | transactionCurrencyCode varchar(255),
22 | transactionCurrencyConversionRate varchar(255),
23 | transactionDate varchar(255),
24 | transactionTime varchar(255),
25 | localHour varchar(255),
26 | transactionScenario varchar(255),
27 | transactionType varchar(255),
28 | transactionMethod varchar(255),
29 | transactionDeviceType varchar(255),
30 | transactionDeviceId varchar(255),
31 | transactionIPaddress varchar(255),
32 | ipState varchar(255),
33 | ipPostcode varchar(255),
34 | ipCountryCode varchar(255),
35 | isProxyIP varchar(255),
36 | browserType varchar(255),
37 | browserLanguage varchar(255),
38 | paymentInstrumentType varchar(255),
39 | cardType varchar(255),
40 | cardNumberInputMethod varchar(255),
41 | paymentInstrumentID varchar(255),
42 | paymentBillingAddress varchar(255),
43 | paymentBillingPostalCode varchar(255),
44 | paymentBillingState varchar(255),
45 | paymentBillingCountryCode varchar(255),
46 | paymentBillingName varchar(255),
47 | shippingAddress varchar(255),
48 | shippingPostalCode varchar(255),
49 | shippingCity varchar(255),
50 | shippingState varchar(255),
51 | shippingCountry varchar(255),
52 | cvvVerifyResult varchar(255),
53 | responseCode varchar(255),
54 | digitalItemCount varchar(255),
55 | physicalItemCount varchar(255),
56 | purchaseProductType varchar(255)
57 | );
58 | 
59 | drop table if exists Account_Info
60 | create table Account_Info (
61 | accountID varchar(255),
62 | transactionDate varchar(255),
63 | transactionTime varchar(255),
64 | accountOwnerName varchar(255),
65 | accountAddress varchar(255),
66 | accountPostalCode varchar(255),
67 | accountCity varchar(255),
68 | accountState varchar(255),
69 | accountCountry varchar(255),
70 | accountOpenDate varchar(255),
71 | accountAge varchar(255),
72 | isUserRegistered varchar(255),
73 | paymentInstrumentAgeInAccount varchar(255),
74 | numPaymentRejects1dPerUser varchar(255)
75 | );
76 | 
77 | drop table if exists Fraud
78 | create table Fraud (
79 | transactionID varchar(255),
80 | accountID varchar(255),
81 | transactionAmount varchar(255),
82 | transactionCurrencyCode varchar(255),
83 | transactionDate varchar(255),
84 | transactionTime varchar(255),
85 | localHour varchar(255),
86 | transactionDeviceId varchar(255),
87 | transactionIPaddress varchar(255)
88 | );
89 | 
90 | drop table if exists Transaction_History
91 | create table Transaction_History
92 | (
93 | accountID varchar(255),
94 | transactionID varchar(255),
95 | transactionDateTime datetime,
96 | transactionAmountUSD varchar(255)
97 | );
98 | 
99 | 
--------------------------------------------------------------------------------
/SQLR/Step10A_Evaluation.sql:
--------------------------------------------------------------------------------
1 | /*
2 | This script will create stored procedure to generate fraud account level metrics
3 | 
4 | parameters:
5 | @table= the scored data to be evaluated
6 | */
7 | 
8 | set ansi_nulls on
9 | go
10 | 
11 | set quoted_identifier on
12 | go
13 | 
14 | DROP PROCEDURE IF EXISTS dbo.EvaluateR
15 | GO
16 | 
17 | create procedure dbo.EvaluateR @table nvarchar(max)
18 | as
19 | begin
20 | 
21 | /* create table to store
the result */
22 | if exists
23 | (select * from sysobjects where name like 'Performance')
24 | truncate table Performance
25 | else
26 | create table Performance (
27 | ADR varchar(255),
28 | PCT_NF_Acct varchar(255),
29 | Dol_Frd varchar(255),
30 | Do_NF varchar(255),
31 | VDR varchar(255),
32 | Acct_FP varchar(255),
33 | PCT_Frd varchar(255),
34 | PCT_NF varchar(255),
35 | AFPR varchar(255),
36 | TFPR varchar(255)
37 | );
38 | 
39 | /* specify the query to select data to be evaluated. this query will be used as input for following R script */
40 | declare @GetScoreData nvarchar(max)
41 | set @GetScoreData = 'select accountID, transactionDateTime, transactionAmountUSD, label, [Probability.1] from ' + @table + ' order by accountID, transactionDateTime'
42 | 
43 | /* R script to generate account level metrics */
44 | insert into Performance
45 | exec sp_execute_external_script @language = N'R',
46 | @script = N'
47 | ####################################################################################################
48 | ## Fraud account level metrics
49 | ####################################################################################################
50 | # Implement account-level performance metrics and transaction-level metrics.
51 | # ADR -- Fraud account detection rate
52 | # VDR -- Value detection rate. The percentage of values saved.
53 | # AFPR -- Account-level false positive ratio.
54 | # ROC -- Transaction-level ROC
55 | # $ROC -- Dollar weighted ROC
56 | # TFPR -- Transaction level false positive ratio.
57 | # sampling rates are taken into consideration to derive performance on the original unsampled dataset.
58 | # contactPeriod is in the unit of days, indicating the lag before a customer is contacted again
59 | # to verify high-score transactions are legitimate.
60 | scr2stat <-function(dataset, contactPeriod, sampleRateNF,sampleRateFrd)
61 | {
62 | #scr quantization/binning into 1000 equal bins
63 | 
64 | #account level score is the maximum of the transaction scores of that account
65 | #all transactions after the first fraud transaction detected are value savings
66 | #input score file needs to be acct-date-time sorted
67 | dataset$"Scored Probabilities" <- dataset$Probability.1
68 | 
69 | fields = names(dataset)
70 | if(! ("accountID" %in% fields))
71 | {print ("Error: Need accountID column!")}
72 | if(! ("transactionDateTime" %in% fields))
73 | {print ("Error: Need transactionDateTime column!")}
74 | if(! ("transactionAmountUSD" %in% fields))
75 | {print ("Error: Need transactionAmountUSD column!")}
76 | if(! ("Scored Probabilities" %in% fields))
77 | {print ("Error: Need Scored Probabilities column!")}
78 | 
79 | nRows = dim(dataset)[1];
80 | 
81 | nBins = 1000;
82 | 
83 | #1. first calculate the perf stats by score band
84 | 
85 | prev_acct =dataset$accountID[1]
86 | prev_score = 0
87 | is_frd_acct = 0
88 | max_scr = 0
89 | 
90 | 
91 | scr_hash=matrix(0, nBins,10)
92 | 
93 | f_scr_rec = vector("numeric",nBins)
94 | #nf_scr_rec = matrix(0, nBins,2) #count, datetime
95 | nf_scr_rec_count = vector("numeric",nBins)
96 | nf_scr_rec_time = vector("numeric",nBins)
97 | 
98 | 
99 | for (r in 1:nRows)
100 | 
101 | 
102 | {
103 | acct = as.character(dataset$accountID[r])
104 | dolamt = as.double(dataset$transactionAmountUSD[r])
105 | label = dataset$label[r]
106 | score = dataset$"Scored Probabilities"[r]
107 | datetime = dataset$transactionDateTime[r]
108 | 
109 | if(score == 0)
110 | {
111 | score = score + 0.00001
112 | print ("The following account has zero score!")
113 | print (paste(acct,dolamt,datetime,sep=" "));
114 | }
115 | 
116 | if(label == 2) next
117 | 
118 | 
119 | if (acct != prev_acct){
120 | scr_bin = ceiling(max_scr*nBins)
121 | 
122 | 
123 | if (is_frd_acct) {
124 | scr_hash[,5] = scr_hash[,5] + f_scr_rec #vdr
125 | scr_hash[scr_bin,1] = scr_hash[scr_bin,1] + 1 #adr
126 | }
127 | else {
128 | scr_hash[,6] = scr_hash[,6] + as.numeric(nf_scr_rec_count) #FP with contact period, a FP could be considered as multiple
129 | scr_hash[scr_bin,2] = scr_hash[scr_bin,2]+1; #a FP account considered one acct
130 | }
131 | 
132 | f_scr_rec = vector("numeric",nBins)
133 | 
134 | nf_scr_rec_count = vector("numeric",nBins)
135 | nf_scr_rec_time = vector("numeric",nBins)
136 | 
137 | is_frd_acct = 0;
138 | total_nf_dol = 0;
139 | total_frd_dol = 0;
140 | max_scr = 0;
141 | }
142 | 
143 | if (score > max_scr) {
144 | max_scr = score;
145 | }
146 | 
147 | #find out the bin the current acct falls in.
148 | tran_scr_bin = ceiling(score*nBins)
149 | 
150 | 
151 | #dollar weighted ROC and regular ROC
152 | if(label == 1){
153 | scr_hash[tran_scr_bin,3] = scr_hash[tran_scr_bin,3]+dolamt;
154 | scr_hash[tran_scr_bin,7] = scr_hash[tran_scr_bin,7]+1;
155 | is_frd_acct = 1;
156 | }
157 | else{
158 | scr_hash[tran_scr_bin,4] = scr_hash[tran_scr_bin,4]+dolamt;
159 | scr_hash[tran_scr_bin,8] = scr_hash[tran_scr_bin,8]+1;
160 | }
161 | 
162 | #ADR/VDR
163 | if(label == 1)
164 | {
165 | #ADR
166 | f_scr_rec[tran_scr_bin] = 1
167 | 
168 | #VDR
169 | #If a higher score appeared before the current score, then this is also savings for the higher score.
170 | #Once a fraud transaction is discovered, all subsequent approved transactions are savings.
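# Illustration (hypothetical numbers): if this fraud transaction has dolamt = 120 and
# the account''s running max score is 0.85, the loop below adds 120 to bins 1..850,
# so any alert threshold at or below 0.85 counts these dollars as detected (saved) value.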
171 | for(i in 1: ceiling(max_scr*nBins))
172 | {
173 | f_scr_rec[i] = f_scr_rec[i] + dolamt
174 | }
175 | }
176 | else
177 | {
178 | #False Positive Accounts (FP) with recontact period
179 | #check if there are any earlier dates for the same or lower score
180 | #update the count and dates when within recontact period
181 | 
182 | #for(i in 1: floor(max_scr*nBins))
183 | for(i in 1: tran_scr_bin)
184 | {
185 | 
186 | prev_time = nf_scr_rec_time[i]
187 | #print(paste(i, tran_scr_bin, sep=" "))
188 | #print(paste(acct, datetime, sep=" "))
189 | #print(prev_time)
190 | if( prev_time > 0)
191 | {
192 | timeDiff = difftime(strptime(datetime,"%Y-%m-%d %H:%M:%S"),strptime(prev_time,"%Y-%m-%d %H:%M:%S"), units="days" )
193 | if(timeDiff >= contactPeriod)
194 | {
195 | nf_scr_rec_count[i] = nf_scr_rec_count[i] +1
196 | nf_scr_rec_time[i] = datetime
197 | }
198 | }
199 | else
200 | {
201 | nf_scr_rec_count[i] = nf_scr_rec_count[i] +1
202 | nf_scr_rec_time[i] = datetime
203 | }
204 | 
205 | }
206 | 
207 | }
208 | 
209 | prev_acct = acct;
210 | 
211 | }
212 | 
213 | 
214 | #1 -- #Frd Acct
215 | #2 -- #NF Acct with infinite recontact period
216 | #3 -- $Frd Tran
217 | #4 -- $NF Tran
218 | #5 -- $Frd Saving
219 | #6 -- #NF Acct with finite recontact period
220 | #7 -- #Frd Tran
221 | #8 -- #NF Tran
222 | #9 -- AFPR
223 | #10 --TFPR
224 | 
225 | #2. now calculate the cumulative perf counts
226 | 
227 | # 5, 6 already in cumulative during previous calculation
228 | 
229 | for (i in (nBins-1):1){
230 | 
231 | for(j in c(1:4,7:8)){
232 | scr_hash[i,j] = scr_hash[i,j]+scr_hash[i+1,j];
233 | }
234 | }
235 | 
236 | #3 calculate AFPR, TFPR:
237 | scr_hash[,9] = scr_hash[,6]/(scr_hash[,1]+0.0001)
238 | scr_hash[,10] = scr_hash[,8]/(scr_hash[,7]+0.0001)
239 | 
240 | #print(scr_hash)
241 | 
242 | #4. now calculate the ADR/VDR, ROC percentage
243 | for(j in c(1:5,7:8)){
244 | scr_hash[,j] = scr_hash[,j]/scr_hash[1,j];
245 | }
246 | 
247 | #5. Adjust for sampling rate
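# Illustration (hypothetical rates): with sampleRateNF = 0.1 and sampleRateFrd = 1,
# the non-fraud columns (2, 4, 6, 8) are scaled up 10x to estimate the unsampled
# population, and the ratio columns (9 AFPR, 10 TFPR) are rescaled by
# sampleRateFrd/sampleRateNF so numerator and denominator stay consistent.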
248 | for (j in c(1, 3, 5 ,7))
249 | {
250 | scr_hash[,j]= scr_hash[,j]/sampleRateFrd
251 | }
252 | 
253 | for (j in c(2, 4, 6 ,8))
254 | {
255 | scr_hash[,j]= scr_hash[,j]/sampleRateNF
256 | }
257 | 
258 | for (j in c(9, 10))
259 | {
260 | scr_hash[,j]= scr_hash[,j]/sampleRateNF*sampleRateFrd
261 | }
262 | 
263 | 
264 | perf.df = as.data.frame(scr_hash)
265 | colnames(perf.df) = c(''ADR'',''PCT NF Acct'',''Dol Frd'', ''Dol NF'', ''VDR'', ''Acct FP(recontact period)'', ''PCT Frd'', ''PCT NF'',''AFPR'',''TFPR'')
266 | return (perf.df)
267 | }
268 | scored_data <- InputDataSet
269 | scored_data$transactionDateTime <- as.character(scored_data$transactionDateTime)
270 | perf <- scr2stat(scored_data,contactPeriod=30, sampleRateNF=1,sampleRateFrd=1)
271 | OutputDataSet <- as.data.frame(perf)
272 | ',
273 | @input_data_1 = @GetScoreData
274 | ;
275 | end
--------------------------------------------------------------------------------
/SQLR/Step10B_Evaluation_AUC.sql:
--------------------------------------------------------------------------------
1 | /*
2 | This script will create stored procedure to calculate AUC
3 | 
4 | parameters:
5 | @table= the scored data to be evaluated
6 | */
7 | 
8 | set ansi_nulls on
9 | go
10 | 
11 | set quoted_identifier on
12 | go
13 | 
14 | DROP PROCEDURE IF EXISTS dbo.EvaluateR_auc
15 | GO
16 | 
17 | create procedure dbo.EvaluateR_auc @table nvarchar(max)
18 | as
19 | begin
20 | 
21 | /* create table to store AUC value */
22 | if exists
23 | (select * from sysobjects where name like 'Performance_Auc')
24 | truncate table Performance_Auc
25 | else
26 | create table Performance_Auc (
27 | AUC float
28 | );
29 | 
30 | /* specify the query to select data to be evaluated. this query will be used as input for following R script */
31 | declare @GetScoreData nvarchar(max)
32 | set @GetScoreData = 'select * from ' + @table
33 | 
34 | /* R script to calculate AUC */
35 | insert into Performance_Auc
36 | exec sp_execute_external_script @language = N'R',
37 | @script = N'
38 | Predictions <- InputDataSet
39 | Predictions$label <- as.numeric(as.character(Predictions$label))
40 | 
41 | # Compute the AUC.
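# rxRoc (RevoScaleR) evaluates sensitivity/specificity over numBreaks probability
# thresholds from the actual labels and predicted probabilities, and rxAuc then
# integrates that ROC curve into a single area-under-the-curve value.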
42 | ROC <- rxRoc(actualVarName = "label", predVarNames = "Probability.1", data = Predictions, numBreaks = 1000)
43 | AUC <- rxAuc(ROC)
44 | OutputDataSet <- as.data.frame(AUC)
45 | ',
46 | @input_data_1 = @GetScoreData
47 | ;
48 | end
--------------------------------------------------------------------------------
/SQLR/Step1_MergeAcctInfo.sql:
--------------------------------------------------------------------------------
1 | /*
2 | This script will create stored procedure to merge untagged transactions with account-level information
3 | */
4 | 
5 | set ansi_nulls on
6 | go
7 | 
8 | set quoted_identifier on
9 | go
10 | 
11 | DROP PROCEDURE IF EXISTS MergeAcctInfo
12 | GO
13 | 
14 | create procedure MergeAcctInfo @table nvarchar(max)
15 | as
16 | begin
17 | 
18 | declare @droptable nvarchar(max)
19 | set @droptable = 'drop table if exists ' + @table + '_Acct'
20 | exec sp_executesql @droptable
21 | 
22 | /* Merge with AccountInfo_Sort table */
23 | declare @MergeQuery nvarchar(max)
24 | set @MergeQuery =
25 | '
26 | select t1.*,
27 | t2.accountOwnerName,
28 | t2.accountAddress,
29 | t2.accountPostalCode,
30 | t2.accountCity,
31 | t2.accountState,
32 | t2.accountCountry,
33 | t2.accountOpenDate,
34 | t2.accountAge,
35 | t2.isUserRegistered,
36 | t2.paymentInstrumentAgeInAccount,
37 | t2.numPaymentRejects1dPerUser
38 | into ' + @table + '_Acct ' +
39 | 'from
40 | (select *,
41 | convert(datetime,stuff(stuff(stuff(concat(transactionDate,dbo.FormatTime(transactionTime)), 9, 0, '' ''), 12, 0, '':''), 15, 0, '':'')) as transactionDateTime
42 | from ' + @table + ') as t1
43 | outer apply
44 | (select top 1 * -- the top 1 is the maximum transactionDateTime up to current transactionDateTime
45 | from Account_Info_Sort as t
46 | where t.accountID = t1.accountID and t.recordDateTime <= t1.transactionDateTime) as t2
47 | where t1.accountID = t2.accountID
48 | '
49 | 
50 | exec sp_executesql @MergeQuery
51 | end
--------------------------------------------------------------------------------
/SQLR/Step2_Tagging.sql:
--------------------------------------------------------------------------------
1 | /*
2 | This script will create stored procedure to do the following:
3 | 1. normalize transactionTime to 6 digits
4 | 2. remove duplicated rows
5 | 3. 
tagging on account level
6 | 
7 | input parameters:
8 | @untaggedtable = table of untagged transactions
9 | @fraudtable = table of fraud transactions
10 | */
11 | 
12 | set ansi_nulls on
13 | go
14 | 
15 | set quoted_identifier on
16 | go
17 | 
18 | DROP PROCEDURE IF EXISTS Tagging
19 | GO
20 | 
21 | create procedure Tagging
22 | @untaggedtable varchar(max),
23 | @fraudtable varchar(max)
24 | as
25 | begin
26 | 
27 | DROP TABLE IF EXISTS Tagged;
28 | 
29 | /***********************************************************************/
30 | /* reformat transactionTime and create transactionDateTime for fraud transactions*/
31 | /**********************************************************************/
32 | /* ##table is a global temporary table which will be written only once to temporary database */
33 | declare @maketransactionDateTime nvarchar(max)
34 | set @maketransactionDateTime =
35 | '
36 | select *,
37 | convert(datetime,stuff(stuff(stuff(concat(transactionDate,dbo.FormatTime(transactionTime)), 9, 0, '' ''), 12, 0, '':''), 15, 0, '':'')) as transactionDateTime
38 | into ##Formatted_Fraud
39 | from ' + @fraudtable
40 | 
41 | exec sp_executesql @maketransactionDateTime
42 | /*****************************************************************************************************************/
43 | /* remove duplicates based on the keys: transactionID, accountID, transactionDateTime, transactionAmount */
44 | /*****************************************************************************************************************/
45 | /* sometimes an entire transaction might be divided into multiple sub-transactions; thus, even when transactionID, accountID, and transactionDate/Time are the same, the amounts might differ */
46 | declare @removeduplicates1 nvarchar(max)
47 | set @removeduplicates1 =
48 | ';WITH cte_1
49 | AS (SELECT ROW_NUMBER() OVER (PARTITION BY transactionID, accountID, transactionDateTime, transactionAmount
50 | ORDER BY transactionID ASC) RN
51 | FROM ' + @untaggedtable + ')
52 | DELETE FROM cte_1
53 | WHERE RN > 1;'
54 | exec sp_executesql @removeduplicates1
55 | 
56 | ;WITH cte_2
57 | AS (SELECT ROW_NUMBER() OVER (PARTITION BY transactionID, accountID, transactionDate, transactionDateTime, transactionAmount
58 | ORDER BY transactionID ASC) RN
59 | FROM ##Formatted_Fraud)
60 | DELETE FROM cte_2
61 | WHERE RN > 1;
62 | 
63 | 
64 | /*********************************************************************************************************************/
65 | /* tagging on account level:
66 | if accountID can't be found in fraud dataset => tag as 0, non fraud
67 | if accountID found in fraud dataset but transactionDateTime is out of the fraud time range => tag as 2, pre-fraud
68 | if accountID found in fraud dataset and transactionDateTime is within the fraud time range => tag as 1, fraud */
69 | /**********************************************************************************************************************/
70 | /* convert fraud to account level and create start and end date time */
71 | select accountID, min(transactionDateTime) as startDateNTime, max(transactionDateTime) as endDateNTime
72 | into ##Fraud_Account
73 | from ##Formatted_Fraud
74 | group by accountID
75 | 
76 | 
77 | /* Tagging */
78 | declare @tagging nvarchar(max)
79 | set @tagging =
80 | 'select t.*,
81 | case
82 | when (sDT is not null and tDT >= sDT and tDT <= eDT) then 1
83 | when (sDT is not null and tDT < sDT) then 2
84 | when (sDT is not null and tDT > eDT) then 2
85 | when sDT is null then 0
86 | end as label
87 | into Tagged
88 | from
89 | (select t1.*,
90 | t1.transactionDateTime as tDT,
91 | t2.startDateNTime as sDT,
92 | t2.endDateNTime as eDT
93 | from ' + @untaggedtable + ' as t1
94 | left join
95 | ##Fraud_Account as t2
96 | on t1.accountID = t2.accountID
97 | ) t'
98 | exec sp_executesql @tagging
99 | 
100 | drop table ##Fraud_Account
101 | drop table ##Formatted_Fraud
102 | end
--------------------------------------------------------------------------------
/SQLR/Step3_SplitData.sql:
--------------------------------------------------------------------------------
1 | /*
2 | This script will create stored procedure to split data on account level
3 | 
4 | input parameter:
5 | @table = table to be split
6 | */
7 | 
8 | set ansi_nulls on
9 | go
10 | 
11 | set quoted_identifier on
12 | go
13 | 
14 | DROP PROCEDURE IF EXISTS SplitData
15 | GO
16 | 
17 | create procedure SplitData @table varchar(max)
18 | as
19 | begin
20 | 
21 | 
22 | /* hash accountID into 100 bins and split */
23 | 
24 | declare @hashacctNsplit nvarchar(max)
25 | set @hashacctNsplit ='
26 | DROP TABLE IF EXISTS Tagged_Training
27 | DROP TABLE IF EXISTS Tagged_Testing
28 | DROP TABLE IF EXISTS Hash_Id
29 | 
30 | select accountID,
31 | abs(CAST(CAST(HashBytes(''MD5'', accountID) AS VARBINARY(64)) AS BIGINT) % 100) as hashCode
32 | into Hash_Id
33 | from ' + @table + '
34 | 
35 | select * into Tagged_Training
36 | from ' +@table + '
37 | where accountID in (select accountID from Hash_Id where hashCode <= 70)
38 | 
39 | select * into Tagged_Testing
40 | from ' +@table + '
41 | where accountID in (select accountID from Hash_Id where hashCode > 70)'
42 | 
43 | exec sp_executesql @hashacctNsplit
44 | 
45 | end
--------------------------------------------------------------------------------
/SQLR/Step4_Preprocess.sql:
--------------------------------------------------------------------------------
1 | /*
2 | This script will create stored procedure to do preprocessing including:
3 | 1. fill missing values with 0
4 | 2. remove transactions with negative transaction amount
5 | 3. remove transactions with invalid transactionDate and time
6 | 4. remove prefraud: label == 2
7 | 
8 | input parameters:
9 | @table = the table to be preprocessed
10 | */
11 | 
12 | set ansi_nulls on
13 | go
14 | 
15 | set quoted_identifier on
16 | go
17 | 
18 | DROP PROCEDURE IF EXISTS Preprocess
19 | GO
20 | 
21 | create procedure Preprocess @table nvarchar(max)
22 | as
23 | begin
24 | 
25 | /* drop view if exists */
26 | declare
27 | @sql_dropview nvarchar(max) = '';
28 | set @sql_dropview = '
29 | DROP VIEW IF EXISTS ' + @table + '_Processed'
30 | exec sp_executesql @sql_dropview;
31 | 
32 | /* create a view to do preprocessing */
33 | declare @sql_process nvarchar(max) = '';
34 | set @sql_process = '
35 | create view ' + @table + '_Processed as
36 | select
37 | label,
38 | accountID,
39 | transactionID,
40 | transactionDateTime,
41 | isnull(isProxyIP, ''0'') as isProxyIP,
42 | isnull(paymentInstrumentType, ''0'') as paymentInstrumentType,
43 | isnull(cardType, ''0'') as cardType,
44 | isnull(paymentBillingAddress, ''0'') as paymentBillingAddress,
45 | isnull(paymentBillingPostalCode, ''0'') as paymentBillingPostalCode,
46 | isnull(paymentBillingCountryCode, ''0'') as paymentBillingCountryCode,
47 | isnull(paymentBillingName, ''0'') as paymentBillingName,
48 | isnull(accountAddress, ''0'') as accountAddress,
49 | isnull(accountPostalCode, ''0'') as accountPostalCode,
50 | isnull(accountCountry, ''0'') as accountCountry,
51 | isnull(accountOwnerName, ''0'') as accountOwnerName,
52 | isnull(shippingAddress, ''0'') as shippingAddress,
53 | isnull(transactionCurrencyCode, ''0'') as transactionCurrencyCode,
54 | isnull(localHour,''-99'') as localHour,
55 | isnull(ipState, ''0'') as ipState,
56 | isnull(ipPostCode, ''0'') as ipPostCode,
57 | isnull(ipCountryCode, ''0'') as ipCountryCode,
58 | isnull(browserLanguage, ''0'') as browserLanguage,
59 | isnull(paymentBillingState, ''0'') as paymentBillingState,
60 | isnull(accountState, ''0'') as accountState,
61 | case when isnumeric(transactionAmountUSD)=1 then cast(transactionAmountUSD as float) else 0 end as transactionAmountUSD,
62 | case when isnumeric(digitalItemCount)=1 then cast(digitalItemCount as float) else 0 end as digitalItemCount,
63 | case when isnumeric(physicalItemCount)=1 then cast(physicalItemCount as float) else 0 end as physicalItemCount,
64 | case when isnumeric(accountAge)=1 then cast(accountAge as float) else 0 end as accountAge,
65 | case when isnumeric(paymentInstrumentAgeInAccount)=1 then cast(paymentInstrumentAgeInAccount as float) else 0 end as paymentInstrumentAgeInAccount,
66 | case when isnumeric(numPaymentRejects1dPerUser)=1 then cast(numPaymentRejects1dPerUser as float) else 0 end as numPaymentRejects1dPerUser,
67 | isUserRegistered = case when isUserRegistered like ''%[0-9]%'' then ''0'' else isUserRegistered end
68 | from ' + @table + '
69 | where cast(transactionAmountUSD as float) >= 0 and
70 | (case when transactionDateTime is null then 1 else 0 end) = 0 and
71 | label < 2'
72 | 
73 | exec sp_executesql @sql_process
74 | end
75 | 
76 | 
77 | 
--------------------------------------------------------------------------------
/SQLR/Step5_Save2History.sql:
--------------------------------------------------------------------------------
1 | /*
2 | This script will create stored procedure to do the following:
3 | 1. truncate historical table if truncateflag = '1'
4 | 2. save transactions to the historical table
5 | 
6 | input parameters:
7 | @table = table of transactions to be saved into the historical table
8 | @truncateflag = indicates whether the historical table needs to be truncated: '1'=yes, '0'=no
9 | */
10 | 
11 | set ansi_nulls on
12 | go
13 | 
14 | set quoted_identifier on
15 | go
16 | 
17 | DROP PROCEDURE IF EXISTS Save2TransactionHistory
18 | GO
19 | 
20 | create procedure Save2TransactionHistory @table nvarchar(max),
21 | @truncateflag nvarchar(max)
22 | as
23 | begin
24 | 
25 | /* truncate historical table if truncateflag = '1' */
26 | declare @truncatetable nvarchar(max) = '';
27 | set @truncatetable = 'if cast(' + @truncateflag + ' as int) = 1 truncate table Transaction_History'
28 | exec sp_executesql @truncatetable
29 | 
30 | /* insert transactions into historical table */
31 | declare @sql_save2history nvarchar(max) = '';
32 | set @sql_save2history ='
33 | insert into Transaction_History
34 | select accountID, transactionID, transactionDateTime, transactionAmountUSD from ' + @table + ';'
35 | exec sp_executesql @sql_save2history
36 | 
37 | end
--------------------------------------------------------------------------------
/SQLR/Step6_CreateRiskTables.sql:
--------------------------------------------------------------------------------
1 | /*
2 | This script will create stored procedure to create all risk tables
3 | */
4 | 
5 | set ansi_nulls on
6 | go
7 | 
8 | set quoted_identifier on
9 | go
10 | 
11 | DROP PROCEDURE IF EXISTS CreateRiskTable_ForAll
12 | GO
13 | 
14 | create procedure CreateRiskTable_ForAll
15 | as
16 | begin
17 | 
18 | /* create a table to store names of variables and risk tables. will be used as reference in the loop later */
19 | if exists
20 | (select * from sysobjects where name like 'Risk_Var')
21 | truncate table Risk_Var
22 | else
23 | create table dbo.Risk_Var (ID int,var_names varchar(255), table_names varchar(255));
24 | 
25 | insert into Risk_Var values (1, 'transactionCurrencyCode', 'Risk_TransactionCurrencyCode');
26 | insert into Risk_Var values (2, 'localHour', 'Risk_LocalHour');
27 | insert into Risk_Var values (3, 'ipState', 'Risk_IpState');
28 | insert into Risk_Var values (4, 'ipPostCode', 'Risk_IpPostCode');
29 | insert into Risk_Var values (5, 'ipCountryCode', 'Risk_IpCountryCode');
30 | insert into Risk_Var values (6, 'browserLanguage', 'Risk_BrowserLanguage');
31 | insert into Risk_Var values (7, 'paymentBillingPostalCode', 'Risk_PaymentBillingPostalCode');
32 | insert into Risk_Var values (8, 'paymentBillingState', 'Risk_PaymentBillingState');
33 | insert into Risk_Var values (9, 'paymentBillingCountryCode', 'Risk_PaymentBillingCountryCode');
34 | insert into Risk_Var values (10, 'accountPostalCode', 'Risk_AccountPostalCode');
35 | insert into Risk_Var values (11, 'accountState', 'Risk_AccountState');
36 | insert into Risk_Var values (12, 'accountCountry', 'Risk_AccountCountry');
37 | 
38 | /* create all risk tables by looping over all variables in reference table and executing CreateRiskTable stored procedure */
39 | DECLARE @name_1 NVARCHAR(100)
40 | DECLARE @name_2 NVARCHAR(100)
41 | DECLARE @getname CURSOR
42 | 
43 | SET @getname = CURSOR FOR
44 | SELECT var_names,
45 | table_names
46 | FROM Risk_Var
47 | OPEN @getname
48 | FETCH NEXT
49 | FROM @getname INTO @name_1,@name_2
50 | WHILE @@FETCH_STATUS = 0
51 | BEGIN
52 | EXEC CreateRiskTable @name_1,@name_2 -- create risk table by calling stored procedure CreateRiskTable
53 | FETCH NEXT
54 | FROM @getname INTO @name_1, @name_2
55 | END
56 | 
57 | CLOSE @getname
58 | DEALLOCATE @getname
59 | end
--------------------------------------------------------------------------------
/SQLR/Step7_FeatureEngineer.sql:
--------------------------------------------------------------------------------
1 | /*
2 | This script will create stored procedure to do the following feature engineering:
3 | 1. create mismatch flags
4 | 2. convert categorical variables to numerical by assigning risk values based on risk tables
5 | 3. calculate aggregates
6 | 
7 | input parameters:
8 | @table = the table to be feature engineered
9 | */
10 | 
11 | set ansi_nulls on
12 | go
13 | 
14 | set quoted_identifier on
15 | go
16 | 
17 | DROP PROCEDURE IF EXISTS FeatureEngineer
18 | GO
19 | 
20 | create procedure FeatureEngineer @table nvarchar(max)
21 | as
22 | begin
23 | 
24 | /* create mismatch flags and assign risk values */
25 | declare
26 | @sql_dropview1 nvarchar(max) = '';
27 | set @sql_dropview1 = '
28 | DROP VIEW IF EXISTS ' + @table + '_Features1'
29 | exec sp_executesql @sql_dropview1;
30 | 
31 | declare @sql_fe1 nvarchar(max) = '';
32 | set @sql_fe1 = 'create view ' + @table + '_Features1 as
33 | select t.label,t.accountID,t.transactionID,t.transactionDateTime,
34 | t.transactionAmountUSD,
35 | t.digitalItemCount,
36 | t.physicalItemCount,
37 | t.isProxyIP,
38 | t.paymentInstrumentType,
39 | t.cardType,
40 | t.isUserRegistered,
41 | t.accountAge,
42 | t.paymentInstrumentAgeInAccount,
43 | t.numPaymentRejects1dPerUser,
44 | case when t.transactionAmountUSD > 150 then ''1'' else ''0'' end as isHighAmount,
45 | case when t.paymentBillingAddress = t.accountAddress then ''0'' else ''1'' end as acctBillingAddressMismatchFlag,
46 | case when t.paymentBillingPostalCode = t.accountPostalCode then ''0'' else ''1'' end as acctBillingPostalCodeMismatchFlag,
47 | case when t.paymentBillingCountryCode = t.accountCountry then ''0'' else ''1'' end as acctBillingCountryMismatchFlag,
48 | case when t.paymentBillingName = t.accountOwnerName then ''0'' else ''1'' end as acctBillingNameMismatchFlag,
49 | case when t.shippingAddress = t.accountAddress then ''0'' else ''1'' end as acctShippingAddressMismatchFlag,
50 | case when t.shippingAddress = t.paymentBillingAddress then ''0'' else ''1'' end as shippingBillingAddressMismatchFlag,
51 | isnull(ac.risk,0) as accountCountryRisk,
52 | isnull(apc.risk,0) as accountPostalCodeRisk,
53 | isnull(actst.risk,0) as accountStateRisk,
54 | isnull(bl.risk,0) as browserLanguageRisk,
55 | isnull(ic.risk,0) as ipCountryCodeRisk,
56 | isnull(ipc.risk,0) as ipPostCodeRisk,
57 | isnull(ips.risk,0) as ipStateRisk,
58 | isnull(lh.risk,0) as localHourRisk,
59 | isnull(pbcc.risk,0) as paymentBillingCountryCodeRisk,
60 | isnull(pbpc.risk,0) as paymentBillingPostalCodeRisk,
61 | isnull(pbst.risk,0) as paymentBillingStateRisk,
62 | isnull(tcc.risk,0) as transactionCurrencyCodeRisk
63 | from ' +@table + ' as t
64 | left join Risk_AccountCountry as ac on ac.accountCountry = t.accountCountry
65 | left join Risk_AccountPostalCode as apc on apc.accountPostalCode = t.accountPostalCode
66 | left join Risk_AccountState as actst on actst.accountState = t.accountState
67 | left join Risk_BrowserLanguage as bl on bl.browserLanguage = t.browserLanguage
68 | left join Risk_IpCountryCode as ic on ic.ipCountryCode = t.ipCountryCode
69 | left join Risk_IpPostCode as ipc on ipc.ipPostCode = t.ipPostCode
70 | left join Risk_IpState as ips on ips.ipState = t.ipState
71 | left join Risk_LocalHour as lh on lh.localHour = t.localHour
72 | left join Risk_PaymentBillingCountryCode as pbcc on pbcc.paymentBillingCountryCode = t.paymentBillingCountryCode
73 | left join Risk_PaymentBillingPostalCode as pbpc on pbpc.paymentBillingPostalCode = t.paymentBillingPostalCode
74 | left join Risk_PaymentBillingState as pbst on pbst.paymentBillingState = t.paymentBillingState
75 | left join Risk_TransactionCurrencyCode as tcc on tcc.transactionCurrencyCode = t.transactionCurrencyCode
76 | '
77 | exec sp_executesql @sql_fe1;
78 | 
79 | /* create aggregates on the fly */
80 | declare
81 | @sql_dropview nvarchar(max) = '';
82 | set @sql_dropview = '
83 | DROP VIEW IF EXISTS ' + @table + '_Features'
84 | exec sp_executesql @sql_dropview;
85 | 
86 | declare @sql_fe nvarchar(max) = '';
87 | set @sql_fe = 'create view ' + @table + '_Features as
88 | select * from ' + @table + '_Features1 as t
89 | outer apply
90 | (select
91 | isnull(sum(case when t2.transactionDateTime > last24Hours then cast(t2.transactionAmountUSD as float) end),0) as sumPurchaseAmount1dPerUser,
92 | isnull(count(case when t2.transactionDateTime > last24Hours then t2.transactionAmountUSD end),0) as sumPurchaseCount1dPerUser,
93 | isnull(sum(cast(t2.transactionAmountUSD as float)),0) as sumPurchaseAmount30dPerUser,
94 | isnull(count(t2.transactionAmountUSD),0) as sumPurchaseCount30dPerUser
95 | from Transaction_History as t2
96 | cross apply (values(t.transactionDateTime, DATEADD(hour, -24, t.transactionDateTime), DATEADD(day, -30, t.transactionDateTime))) as c(transactionDateTime, last24Hours, last30Days)
97 | where t2.accountID = t.accountID and t2.transactionDateTime < t.transactionDateTime and t2.transactionDateTime > last30Days
98 | ) as a1'
99 | 
100 | exec sp_executesql @sql_fe;
101 | end
102 | 
103 | 
104 | 
105 | 
--------------------------------------------------------------------------------
/SQLR/Step8_Training.sql:
--------------------------------------------------------------------------------
1 | /*
2 | This script will create stored procedure to do the following:
3 | 1. down sample the majority class
4 | 2. train a gradient boosted tree model
5 | 3. save the trained model into a sql table
6 | 
7 | input parameters:
8 | @table = the table used as training set
9 | */
10 | 
11 | set ansi_nulls on
12 | go
13 | 
14 | set quoted_identifier on
15 | go
16 | 
17 | DROP PROCEDURE IF EXISTS TrainModelR
18 | GO
19 | 
20 | create procedure TrainModelR @table nvarchar(max)
21 | as
22 | begin
23 | 
24 | /* Create an empty table to be filled with the trained models */
25 | if exists
26 | (select * from sysobjects where name like 'Trained_Model')
27 | truncate table Trained_Model
28 | else
29 | create table Trained_Model (
30 | id varchar(200) not null,
31 | value varbinary(max)
32 | --,constraint unique_id3 unique(id)
33 | );
34 | 
35 | /* down sample the majority class by:
36 | 1. sorting the data by label and accountID in descending order
37 | 2. selecting the top 10000 rows
38 | */
39 | declare @GetTrainData nvarchar(max)
40 | set @GetTrainData = 'select * from ' + @table
41 | 
42 | /*Get the database name*/
43 | DECLARE @database_name nvarchar(max) = db_name();
44 | 
45 | /* R script to train GBT model and save the trained model into a sql table */
46 | execute sp_execute_external_script
47 | @language = N'R',
48 | @script = N'
49 | # define the connection string
50 | connection_string <- paste("Driver=SQL Server;Server=localhost;Database=", database_name, ";Trusted_Connection=true;", sep="")
51 | 
52 | # Set the Compute Context to SQL for faster training.
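# RxInSqlServer makes subsequent rx* computations run in-database, so the training
# query below is evaluated on the SQL Server side rather than pulling all rows into
# the local R session over ODBC.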
53 | sql <- RxInSqlServer(connectionString = connection_string)
54 | rxSetComputeContext(sql)
55 | 
56 | ## Point to the training data in sql server
57 | train_sql <- RxSqlServerData(sqlQuery = sprintf("%s", inquery),
58 | connectionString = connection_string,
59 | stringsAsFactors = TRUE)
60 | 
61 | ## make equations
62 | variables_all <- rxGetVarNames(train_sql)
63 | variables_to_remove <- c("label", "accountID", "transactionID", "transactionDateTime")
64 | training_variables <- variables_all[!(variables_all %in% variables_to_remove)]
65 | equation <- paste("label ~ ", paste(training_variables, collapse = "+", sep=""), sep="")
66 | 
67 | ## train GBT model
68 | library("MicrosoftML")
69 | boosted_fit <- rxFastTrees(formula = as.formula(equation),
70 | data = train_sql,
71 | type = c("binary"),
72 | numTrees = 100,
73 | learningRate = 0.2,
74 | splitFraction = 5/24,
75 | featureFraction = 1,
76 | minSplit = 10,
77 | unbalancedSets = TRUE,
78 | randomSeed = 5)
79 | 
80 | ## save the trained model in sql server
81 | # set the compute context to local for exporting tables to SQL
82 | rxSetComputeContext("local")
83 | # Open an Odbc connection with SQL Server.
84 | OdbcModel <- RxOdbcData(table = "Trained_Model", connectionString = connection_string)
85 | rxOpen(OdbcModel, "w")
86 | # Write the model to SQL.
87 | rxWriteObject(OdbcModel, "Gradient Boosted Tree", boosted_fit)
88 | 
89 | '
90 | , @params = N' @inquery nvarchar(max), @database_name varchar(max)'
91 | , @inquery = @GetTrainData
92 | , @database_name = @database_name
93 | ;
94 | end
--------------------------------------------------------------------------------
/SQLR/Step9_Prediction.sql:
--------------------------------------------------------------------------------
1 | /*
2 | This script will create the stored procedure to do the following:
3 | 1. normalize transactionTime to 6 digits if necessary
4 | 2. preprocess data
5 | 3. save transaction data to historical table
6 | 4. feature engineering
7 | 5. scoring
8 | 6. save the scored data set to a sql table
9 | 
10 | input parameters:
11 | @inputtable = the table of data to be scored
12 | @outputtable = the table that stores the scored data
13 | @getacctflag = the flag to indicate if merging with the accountInfo table is needed: '1'=yes, '0'=no
14 | */
15 | 
16 | set ansi_nulls on
17 | go
18 | 
19 | set quoted_identifier on
20 | go
21 | 
22 | DROP PROCEDURE IF EXISTS PredictR
23 | GO
24 | 
25 | create procedure PredictR @inputtable nvarchar(max),
26 | @outputtable nvarchar(max),
27 | @getacctflag nvarchar(max)
28 | as
29 | begin
30 | 
31 | /* merge with the accountInfo table if getacctflag = '1' */
32 | declare @mergeacct nvarchar(max) = '';
33 | set @mergeacct = 'if cast(' + @getacctflag + ' as int) = 1
34 | begin
35 | EXEC MergeAcctInfo ' + @inputtable + '
36 | end'
37 | exec sp_executesql @mergeacct
38 | 
39 | /* copy @inputtable into <inputtable>_Acct if getacctflag = '0' */
40 | declare @renametable nvarchar(max) = '';
41 | set @renametable =
42 | 'if cast(' + @getacctflag + ' as int) = 0
43 | begin
44 | drop table if exists ' + @inputtable + '_Acct
45 | select * into ' + @inputtable + '_Acct from ' + @inputtable + '
46 | end'
47 | exec sp_executesql @renametable
48 | 
49 | /* add a placeholder label if label doesn't exist */
50 | declare @addlabel nvarchar(max) = '';
51 | set @addlabel = '
52 | IF NOT EXISTS(SELECT 1 FROM sys.columns
53 | WHERE Name = N''label''
54 | AND Object_ID = Object_ID(N''' + @inputtable + '_Acct''))
55 | BEGIN
56 | alter table ' + @inputtable + '_Acct add label int not null default(-1)
57 | END'
58 | exec sp_executesql @addlabel
59 | 
60 | /* preprocessing by calling the stored procedure 'Preprocess' */
61 | declare @preprocess nvarchar(max)
62 | set @preprocess = 'exec Preprocess ' + @inputtable + '_Acct'
63 | exec sp_executesql @preprocess
64 | 
65 | /* save transactions to the history table */
66 | declare @sql_save2history nvarchar(max)
67 | set @sql_save2history = 'exec Save2TransactionHistory ' + @inputtable + '_Acct_Processed, ''0'''
68 | exec sp_executesql @sql_save2history
69 | 
70 | /* feature engineering by calling the stored procedure 'FeatureEngineer' */
71 | declare @fe_query nvarchar(max)
72 | set @fe_query = 'exec FeatureEngineer ' + @inputtable + '_Acct_Processed'
73 | exec sp_executesql @fe_query
74 | 
75 | /* specify the query to select the data to be scored. This query will be used as input to the following R script */
76 | declare @GetData2Score nvarchar(max)
77 | set @GetData2Score = 'select * from ' + @inputtable + '_Acct_Processed_Features where label<=1';
78 | 
79 | /* Get the database name */
80 | DECLARE @database_name varchar(max) = db_name();
81 | 
82 | /* R script to do the scoring and save the scored dataset into a sql table */
83 | exec sp_execute_external_script @language = N'R',
84 | @script = N'
85 | ## Get the trained model
86 | # Define the connection string
87 | connection_string <- paste("Driver=SQL Server;Server=localhost;Database=", database_name, ";Trusted_Connection=true;", sep="")
88 | # Create an Odbc connection with SQL Server using the name of the table storing the model
89 | OdbcModel <- RxOdbcData(table = "Trained_Model", connectionString = connection_string)
90 | # Read the model from SQL.
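# (Comment added for clarity: rxReadObject deserializes the varbinary value that
#  rxWriteObject stored in Step8; the key "Gradient Boosted Tree" must match the
#  id under which the model was written.)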
91 | boosted_fit <- rxReadObject(OdbcModel, "Gradient Boosted Tree")
92 | 
93 | ## Point to the data to be scored in sql server
94 | test_sql <- RxSqlServerData(sqlQuery = sprintf("%s", inquery),
95 | connectionString = connection_string,
96 | stringsAsFactors = TRUE)
97 | 
98 | ## Specify the pointer to the output table
99 | Predictions_gbt_sql <- RxSqlServerData(table = outputtable, connectionString = connection_string)
100 | 
101 | ## Define the SQL Compute Context; scoring runs in the local context unless the next line is uncommented.
102 | sql <- RxInSqlServer(connectionString = connection_string)
103 | #rxSetComputeContext(sql)
104 | 
105 | ## Scoring
106 | library("MicrosoftML")
107 | rxPredict(modelObject = boosted_fit,
108 | data = test_sql,
109 | outData = Predictions_gbt_sql,
110 | overwrite = T,
111 | extraVarsToWrite = c("accountID", "transactionID", "transactionDateTime", "transactionAmountUSD", "label"))
112 | 
113 | '
114 | , @params = N' @inquery nvarchar(max), @database_name varchar(max), @outputtable nvarchar(max)'
115 | , @inquery = @GetData2Score
116 | , @database_name = @database_name
117 | , @outputtable = @outputtable
118 | ;
119 | end
-------------------------------------------------------------------------------- /SQLR/UtilityFunctions.sql: --------------------------------------------------------------------------------
1 | /*
2 | This script creates utility functions used by the other stored procedures: currently dbo.FormatTime, which left-pads transactionTime to 6 digits (e.g. '93045' -> '093045')
3 | */
4 | 
5 | set ansi_nulls on
6 | go
7 | 
8 | set quoted_identifier on
9 | go
10 | 
11 | /* create the function to pad transactionTime to 6 digits */
12 | IF object_id(N'FormatTime', N'FN') IS NOT NULL
13 | DROP FUNCTION FormatTime
14 | GO
15 | 
16 | create function dbo.FormatTime (@strTime varchar(255) )
17 | returns varchar(255)
18 | as
19 | begin
20 | declare @strTimeNew varchar(255)
21 | set @strTimeNew =
22 | case
23 | when len(@strTime) = 5 then concat('0',@strTime)
24 | when len(@strTime) = 4 then concat('00',@strTime)
25 | when len(@strTime) = 3 then concat('000',@strTime)
26 | when len(@strTime) = 2 then concat('0000',@strTime)
27 | when len(@strTime) = 1 then concat('00000',@strTime)
28 | else @strTime
29 | end
30 | return(@strTimeNew)
31 | end
32 | go
33 | 
-------------------------------------------------------------------------------- /SQLR/createuser.sql: --------------------------------------------------------------------------------
1 | :on error exit
2 | --
3 | -- remove old $(username) user and login from master.
4 | -- $(username) and $(password) are substituted by Invoke-SqlCmd
5 | -- through environment variables.
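-- (Illustrative, hypothetical invocation -- the parameter values are placeholders
--  and the real call is made by the setup PowerShell scripts:
--    Invoke-SqlCmd -InputFile createuser.sql -Variable @("username=<user>", "password=<password>")
--  each "name=value" entry populates the matching $(name) token below.)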
6 | --
7 | USE [master]
8 | GO
9 | IF EXISTS (SELECT name FROM sys.database_principals WHERE name = '$(username)')
10 | BEGIN
11 | PRINT 'Deleting old $(username) user from master'
12 | DROP USER [$(username)]
13 | END
14 | GO
15 | IF EXISTS (SELECT name FROM master.sys.server_principals WHERE name = '$(username)')
16 | BEGIN
17 | PRINT 'Deleting old $(username) login from master'
18 | DROP LOGIN [$(username)]
19 | END
20 | GO
21 | --
22 | -- create new $(username) login in master
23 | --
24 | USE [master]
25 | GO
26 | PRINT 'Creating $(username) login in master'
27 | CREATE LOGIN [$(username)] WITH PASSWORD=N'$(password)', CHECK_EXPIRATION=OFF, CHECK_POLICY=OFF;
28 | CREATE USER [$(username)] FOR LOGIN [$(username)]
29 | --ALTER ROLE [db_rrerole] ADD MEMBER [$(username)]
30 | ALTER ROLE [db_owner] ADD MEMBER [$(username)]
31 | GO
32 | 
33 | exec sp_addrolemember 'db_owner', '$(username)'
34 | exec sp_addrolemember 'db_ddladmin', '$(username)'
35 | exec sp_addrolemember 'db_accessadmin', '$(username)'
36 | exec sp_addrolemember 'db_datareader', '$(username)'
37 | exec sp_addrolemember 'db_datawriter', '$(username)'
38 | exec sp_addsrvrolemember @loginame= '$(username)', @rolename = 'sysadmin'
39 | GO
40 | 
41 | -- Enable implied authentication so a connection string can be created automatically in R code embedded in SQL stored procedures.
42 | USE [master]
43 | GO
44 | DECLARE @host_name nvarchar(100)
45 | SET @host_name = (SELECT HOST_NAME())
46 | DECLARE @sql nvarchar(max);
47 | SELECT @sql = N'
48 | CREATE LOGIN [' + @host_name + '\SQLRUserGroup] FROM WINDOWS WITH DEFAULT_DATABASE=[master]';
49 | EXEC sp_executesql @sql;
50 | 
51 | 
52 | -- Increase the memory allocated to external (R) processes.
53 | USE [master]
54 | GO
55 | SELECT * FROM sys.resource_governor_resource_pools WHERE name = 'default'
56 | SELECT * FROM sys.resource_governor_external_resource_pools WHERE name = 'default'
57 | ALTER EXTERNAL RESOURCE POOL "default" WITH (max_memory_percent = 100);
58 | ALTER RESOURCE GOVERNOR reconfigure;
59 | 
60 | 
-------------------------------------------------------------------------------- /SQLR/readme.md: --------------------------------------------------------------------------------
1 | 
2 | 
3 | # Fraud Detection
4 | ## Implemented on SQL Server 2016 R Services and HDInsight Spark
5 | 
6 | > Discover more examples at [Microsoft Machine Learning Server](https://github.com/Microsoft/ML-Server)
7 | 
8 | For all documentation, visit the [Fraud Detection website](https://microsoft.github.io/r-server-fraud-detection/).
9 | 
10 | **NOTE:** Please don't use "Download ZIP" to get this repository, as it will change the line endings in the data files. Use "git clone" to get a local copy of this repository.
11 | 
12 | # Contributing
13 | 
14 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
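
A hypothetical end-to-end run of the stored procedures defined in this folder (the table names are illustrative placeholders, not objects shipped with the repository):

```sql
-- train on a prepared feature table, then score new transactions
exec TrainModelR 'Tagged_Training_Processed_Features'
exec PredictR 'Untagged_Transactions', 'Predictions', '1'  -- '1' = merge with accountInfo first
```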
15 | 
-------------------------------------------------------------------------------- /Website/package.json: --------------------------------------------------------------------------------
1 | {
2 | "name": "r-sql",
3 | "version": "1.0.0",
4 | "description": "r sql server",
5 | "main": "server.js",
6 | "scripts": {
7 | "test": "echo \"Error: no test specified\" && exit 1"
8 | },
9 | "keywords": [
10 | "r",
11 | "sql",
12 | "server"
13 | ],
14 | "author": "Sean Wells ",
15 | "license": "MIT",
16 | "dependencies": {
17 | "express": "^4.13.4",
18 | "express-handlebars": "^3.0.0",
19 | "tedious": "^1.13.2"
20 | }
21 | }
22 | 
-------------------------------------------------------------------------------- /Website/public/css/myCSS.css: --------------------------------------------------------------------------------
1 | @charset "UTF-8";
2 | body {
3 | padding-top: 10px;
4 | }
5 | .start {
6 | padding: 40px 15px;
7 | text-align: center;
8 | }
9 | .bdba{
10 | color:#DD4814;
11 | }
12 | .form-control-inline {
13 | min-width: 0;
14 | width: auto;
15 | display: inline;
16 | }
-------------------------------------------------------------------------------- /Website/public/fonts/glyphicons-halflings-regular.eot: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/r-server-fraud-detection/4ff1bc6740a9384d80dbf3cc2c97e374a47248af/Website/public/fonts/glyphicons-halflings-regular.eot
-------------------------------------------------------------------------------- /Website/public/fonts/glyphicons-halflings-regular.ttf: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/r-server-fraud-detection/4ff1bc6740a9384d80dbf3cc2c97e374a47248af/Website/public/fonts/glyphicons-halflings-regular.ttf
-------------------------------------------------------------------------------- /Website/public/fonts/glyphicons-halflings-regular.woff: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/r-server-fraud-detection/4ff1bc6740a9384d80dbf3cc2c97e374a47248af/Website/public/fonts/glyphicons-halflings-regular.woff
-------------------------------------------------------------------------------- /Website/public/img/bracelet.jpg: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/r-server-fraud-detection/4ff1bc6740a9384d80dbf3cc2c97e374a47248af/Website/public/img/bracelet.jpg
-------------------------------------------------------------------------------- /Website/public/img/earrings.jpg: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/r-server-fraud-detection/4ff1bc6740a9384d80dbf3cc2c97e374a47248af/Website/public/img/earrings.jpg
-------------------------------------------------------------------------------- /Website/public/img/heart.jpg: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/r-server-fraud-detection/4ff1bc6740a9384d80dbf3cc2c97e374a47248af/Website/public/img/heart.jpg
-------------------------------------------------------------------------------- /Website/public/img/logo.gif: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/r-server-fraud-detection/4ff1bc6740a9384d80dbf3cc2c97e374a47248af/Website/public/img/logo.gif
-------------------------------------------------------------------------------- /Website/public/img/progress.gif: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/r-server-fraud-detection/4ff1bc6740a9384d80dbf3cc2c97e374a47248af/Website/public/img/progress.gif
-------------------------------------------------------------------------------- /Website/public/img/ring.jpg: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/r-server-fraud-detection/4ff1bc6740a9384d80dbf3cc2c97e374a47248af/Website/public/img/ring.jpg
-------------------------------------------------------------------------------- /Website/public/js/customize.js: --------------------------------------------------------------------------------
1 | // JavaScript Document
2 | 
3 | (function(R) {
4 | 
5 | $("#submitBtn").click(function(){
6 | exeScript()
7 | });
8 | 
9 | /* Execute the repository script to get the score for this claim */
10 | var exeScript = function() {
11 | 
12 | /* callback configuration */
13 | var callback = {
14 | scope : this,
15 | 
16 | // success callback
17 | success : function(result) {
18 | //area.className = '';
19 | var objs = result.deployr.response.workspace.objects;
20 | score = objs[0].value;
21 | 
22 | // Use the score from the script to display the appropriate message
23 | if (score < 3) {
24 | $("#resultArea").html('Thank you for submitting your claim. It has been fast-tracked for processing.');
25 | $("#resultArea").removeClass('alert-danger');
26 | $("#resultArea").addClass('alert-success');
27 | } else {
28 | $("#resultArea").html('Thank you for submitting your claim. Please allow 2-4 weeks for review.');
29 | $("#resultArea").removeClass('alert-success');
30 | $("#resultArea").addClass('alert-danger');
31 | }
32 | 
33 | $("#resultArea").fadeIn();
34 | 
35 | 
36 | },
37 | // failure callback
38 | failure : function(result) {
39 | var msg = result;
40 | 
41 | if (result.deployr) {
42 | msg = result.deployr.response.error;
43 | $("#resultArea").html(msg);
44 | }
45 | }
46 | };
47 | 
48 | /* configuration input for repository script execution */
49 | 
50 | //inputList gathers up all the form values and formats them for DeployR
51 | var inputList = [];
52 | $(".form-control").each(function() {
53 | inputList.push(R.RDataFactory.createString($(this).attr("id"), $(this).val() || ' '));
54 | });
55 | 
56 | //send all the form values as inputs, and retrieve 'score' from the script execution
57 | 
58 | var scriptConfig = {
59 | filename : 'insuranceFraud',
60 | author : 'sheri',
61 | inputs : inputList,
62 | robjects: ['score'],
63 | preloadfilename: 'rtsScoreFraud.R',
64 | preloadfileauthor: 'sheri',
65 | blackbox: true
66 | };
67 | 
68 | // execute RScript
69 | R.DeployR.repositoryScriptExecute(scriptConfig, callback);
70 | };
71 | })(window.Revolution);
-------------------------------------------------------------------------------- /Website/public/js/scoreClaim.js: --------------------------------------------------------------------------------
1 | // JavaScript Document
2 | var scoreClaim = function(id, amt){
3 | //first get the rest of the data for this id
4 | record = lookupData(id, amt)
5 | // call /predict to get res.pred, the predicted probability that this transaction is fraudulent
6 | $.ajax({
7 | url: '/predict',
8 | type: 'GET',
9 | data: { record: record },
10 | contentType:"application/json; charset=utf-8",
11 | error: function(xhr, error){
12 | console.log(xhr); console.log(error);
13 | },
14 | success: function(res) {
15 | console.log("AccountID: " + id + " transactionAmt: " + amt )
16 | console.log("Predicted probability: " + res.pred )
17 | // now use the probability to display one of two messages
18 | if (res.pred > 0.5) { // problem with this order
19 | $("#resultArea").html('There is a problem with this order. Please call 800-555-2222 for more information');
20 | $("#resultArea").removeClass('alert-success');
21 | $("#resultArea").addClass('alert-danger');
22 | 
23 | 
24 | } else { // no problem with the order
25 | $("#resultArea").html('Thank you for submitting your order. You will receive an email with tracking information shortly.');
26 | $("#resultArea").removeClass('alert-danger');
27 | $("#resultArea").addClass('alert-success');
28 | }
29 | // make sure the result is visible
30 | $("#resultArea").removeClass('hide');
31 | $("#resultArea").addClass('show');
32 | // hide the status section
33 | $("#status").removeClass('show');
34 | $("#status").addClass('hide');
35 | $("#resultArea").fadeIn();
36 | }
37 | 
38 | });
39 | }
40 | 
41 | var lookupData = function(custID, amt){
42 | amt = parseFloat(amt.replace(/,/g, ''));
43 | // the rest of the record would be looked up in a customer database.
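// (Comment added for clarity: the assembled record string is sent to /predict,
//  which passes it verbatim to the ScoreOneTrans stored procedure, so the
//  comma-separated field order below must match what that procedure parses.)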
44 | // for this demo we are simply supplying that info directly for our four test accounts
45 | var custData;
46 | 
47 | switch(custID) {
48 | case 'A1055521358474530':
49 | custData = 'USD,NULL,20130409,102958,14,A,P,NULL,NULL,NULL,92.97,dubayy,0,ae,FALSE,NULL,en-US,CREDITCARD,AMEX,NULL,NULL,NULL,33071,FL,US,NULL,NULL,NULL,NULL,NULL,NULL,M,NULL,0,4,NULL';
50 | break;
51 | case 'A914800341525449':
52 | custData = 'USD,NULL,20130409,122427,7,A,P,NULL,NULL,NULL,108.49,massachusetts,2118,us,FALSE,NULL,en-US,CREDITCARD,VISA,NULL,NULL,NULL,1702,MA,US,NULL,NULL,NULL,NULL,NULL,NULL,M,NULL,1,0,NULL';
53 | break;
54 | case 'A1688852355371910':
55 | custData = 'USD,NULL,20130409,110900,6,A,P,NULL,NULL,NULL,99.47,florida,32114,us,FALSE,NULL,en-US,CREDITCARD,VISA,NULL,NULL,NULL,32746,FL,US,NULL,NULL,NULL,NULL,NULL,NULL,M,NULL,1,0,NULL';
56 | break;
57 | default:
58 | custData = 'USD,NULL,20130409,104848,NULL,A,P,NULL,NULL,NULL,121.242,maharashtra,411001,in,FALSE,NULL,en-US,CREDITCARD,VISA,NULL,NULL,NULL,98033,WA,US,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,3,0,NULL';
59 | break;
60 | }
61 | 
62 | var record = 'xxxTRANSID,'+ custID + ',' + amt + ',' + amt + ',' + custData;
63 | 
64 | return(record);
65 | }
66 | 
-------------------------------------------------------------------------------- /Website/public/js/startUp.js: --------------------------------------------------------------------------------
1 | // JavaScript Document
2 | 
3 | $(document).ready ( function () {
4 | $("#resultArea").hide();
5 | $("#status").hide();
6 | // show the login dialog on startup
7 | $('#loginDlg').modal('show');
8 | 
9 | $('#selAccount').change(function() {
10 | $("#claimantID").val( ($(this).val()) );
11 | });
12 | 
13 | $("#resetBtn").click(function(){
14 | // empty the table and change the purchase back to 0.
15 | $("#myTable > tbody").html("");
16 | $("#status").removeClass('show');
17 | $("#status").addClass('hide');
18 | // hide the result area
19 | $("#resultArea").addClass('hide');
20 | $("#resultArea").removeClass('show');
21 | document.getElementById('result').innerHTML = "Total Purchase: $0"
22 | $("#resultArea").fadeOut();
23 | });
24 | 
25 | 
26 | 
27 | $("#submitBtn").click(function(){
28 | acctID = $("#claimantID").val();
29 | // check to make sure there is an item in the cart
30 | if ($('#myTable tr').length > 0 ) {
31 | // also make sure the account id is present.
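// (the account ID is populated by the #selAccount login-dialog handler above)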
32 | if (acctID !== '') {
33 | // show the status message and call scoreClaim
34 | $("#status").removeClass('hide');
35 | $("#status").addClass('show');
36 | $("#status").fadeIn();
37 | // clear the result area
38 | $("#resultArea").addClass('hide');
39 | $("#resultArea").removeClass('show');
40 | var amt = recalc();
41 | scoreClaim( acctID, amt);
42 | } else {
43 | // no account ID present
44 | $("#status").removeClass('show');
45 | $("#status").addClass('hide');
46 | $("#resultArea").html('Please Login and try again.');
47 | $("#resultArea").removeClass('alert-success');
48 | $("#resultArea").addClass('alert-danger');
49 | $("#resultArea").fadeIn();
50 | }
51 | } else {
52 | // no items in the cart
53 | $("#status").removeClass('show');
54 | $("#status").addClass('hide');
55 | $("#resultArea").html('You must have at least one item before you can Purchase.');
56 | $("#resultArea").removeClass('alert-success');
57 | $("#resultArea").addClass('alert-danger');
58 | $("#resultArea").fadeIn();
59 | }
60 | });
61 | 
62 | 
63 | $(".addItem").click (function(){
64 | // Adding items to the cart - just hardcoding a few rows here; each row carries a .val price cell and a deleteMe button used by recalc() and the delete handler below
65 | switch (this.id) {
66 | case "heart":
67 | contents = '<tr>'
68 | contents = contents + '<td>Black and White Diamond Heart</td>'
69 | contents = contents + '<td class="val">$130</td>'
70 | break;
71 | case "earrings":
72 | contents = '<tr>'
73 | contents = contents + '<td>Diamond Pave Earrings</td>'
74 | contents = contents + '<td class="val">$569</td>'
75 | break;
76 | case "bracelet":
77 | contents = '<tr>'
78 | contents = contents + '<td>Diamond Tennis Bracelet</td>'
79 | contents = contents + '<td class="val">$360</td>'
80 | break;
81 | case "ring":
82 | contents = '<tr>'
83 | contents = contents + '<td>Diamond Engagement Ring</td>'
84 | contents = contents + '<td class="val">$2100</td>'
85 | break;
86 | }
87 | contents = contents + '<td><button type="button" class="deleteMe">Remove</button></td></tr>'
88 | $('#myTable > tbody:last-child').append(contents);
89 | recalc()
90 | });
91 | 
92 | // can't use $(".deleteMe").click here because the items are dynamically added, not all present at the start.
93 | $(document).on('click', 'button.deleteMe', function () {
94 | $(this).closest('tr').remove();
95 | recalc();
96 | });
97 | 
98 | function formatTotal(x) {
99 | x = Math.round(x);
100 | return x.toString().replace(/\B(?=(\d{3})+(?!\d))/g, ",");
101 | }
102 | 
103 | function recalc(){
104 | // iterate through all the values in the table (class="val")
105 | var resultVal = 0.0;
106 | $(".val").each ( function() {
107 | var itemval = $(this).text();
108 | itemval = itemval.replace('$', ""); // strip the currency symbol before parsing
109 | resultVal += parseFloat ( itemval.replace(/\s/g,'').replace(',','.'));
110 | });
111 | resultVal = formatTotal(resultVal);
112 | document.getElementById('result').innerHTML = "Total Purchase $" + resultVal;
113 | return(resultVal)
114 | }
115 | 
116 | });
117 | 
-------------------------------------------------------------------------------- /Website/readme.md: --------------------------------------------------------------------------------
1 | 
2 | 
3 | # Fraud Detection
4 | ## Implemented on SQL Server 2016 R Services and HDInsight Spark
5 | 
6 | > Discover more examples at [Microsoft Machine Learning Server](https://github.com/Microsoft/ML-Server)
7 | 
8 | 
9 | Deploy this solution from Cortana Intelligence Gallery with [SQL Server](https://aka.ms/fraud-detection) or [HDInsight Spark Cluster](https://aka.ms/fraud-detection-hdi).
10 | 
11 | For all documentation, visit the [Fraud Detection website](https://microsoft.github.io/r-server-fraud-detection/).
12 | 
13 | **NOTE:** Please don't use "Download ZIP" to get this repository, as it will change the line endings in the data files. Use "git clone" to get a local copy of this repository.
Use "git clone" to get a local copy of this repository. 14 | 15 | # Contributing 16 | 17 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 18 | -------------------------------------------------------------------------------- /Website/server.js: -------------------------------------------------------------------------------- 1 | var express = require('express'); 2 | var Connection = require('tedious').Connection; 3 | var Request = require('tedious').Request; 4 | var TYPES = require('tedious').TYPES; 5 | 6 | var fs = require('fs'); 7 | var util = require('util'); 8 | var logFileName = __dirname + '/debug.log'; 9 | 10 | var app = express(); 11 | var exphbs = require('express-handlebars'); 12 | app.engine('handlebars', exphbs({defaultLayout: 'main'})); 13 | app.set('view engine', 'handlebars'); 14 | 15 | app.use(express.static('public')); 16 | 17 | 18 | 19 | 20 | // 21 | // DB Connection 22 | // 23 | var args = process.argv.slice(2); 24 | if (args.length>0) { 25 | var user = args[0]; 26 | var pw = args[1]; 27 | } 28 | else { 29 | var user = 'XXYOURSQLUSER'; 30 | var pw = 'XXYOURSQLPW'; 31 | } 32 | 33 | 34 | var con = new Connection({ //fix this with fraud db info 35 | userName: user, 36 | password: pw, 37 | server: 'localhost', 38 | // When you connect to Azure SQL Database, you need encrypt: true 39 | options: { encrypt: true, database: 'Fraud_R' } 40 | }); 41 | 42 | con.on('connect', function(err) { 43 | console.log('DB Connection ' + (err ? '~~~ Failure ~~~' : 'Success')); 44 | if (err) console.log(err); 45 | }); 46 | 47 | // 48 | // Put your routes here 49 | // 50 | 51 | // Home Page 52 | app.get('/', function (req, res) { 53 | res.render('home') 54 | }); 55 | 56 | // Kill the server 57 | app.get('/kill', function (req, res) { 58 | setTimeout(() => process.exit(), 500); 59 | }); 60 | 61 | 62 | // predict function, called from scoreClaim.js 63 | 64 | app.get('/predict', function (req, res) { 65 | var request = new Request('ScoreOneTrans', function(err, rowCount) { 66 | if (err) { 67 | console.log(err); 68 | } 69 | // console.log("Rows Returned: " + rowCount ) 70 | }); 71 | 72 | var record = req.query.record; 73 | console.log (record) 74 | request.on('row', function(col) { 75 | if (col[0].value === null) { 76 | console.log('NULL'); 77 | } else { 78 | // values to return - the predicted probability 79 | value = col[0].value; 80 | } 81 | 82 | res.json({ pred: value }); 83 | request.on('doneInProc', function(rowCount, more) { 84 | console.log(rowCount + ' rows returned'); 85 | }); 86 | 87 | }); 88 | // pass the entire record to the stored procedure 89 | request.addParameter('inputstring', TYPES.VarChar, record); 90 | con.callProcedure(request); 91 | con.close; 92 | 93 | 94 | }); 95 | 96 | //log to file 97 | var logFile = fs.createWriteStream(logFileName, { flags: 'a' }); 98 | var logProxy = console.log; 99 | console.log = function (d) { // 100 | logFile.write(util.format(new Date() + ": " + d || '') + '\r\n'); 101 | logProxy.apply(this, arguments); 102 | }; 103 | 104 | app.listen(3000, function () { 105 | console.log('Example app listening on port 3000!'); 106 | }); -------------------------------------------------------------------------------- /Website/startMe.vbs: 
--------------------------------------------------------------------------------
1 | user = WScript.Arguments.Item(0)
2 | pw = WScript.Arguments.Item(1)
3 | 
4 | cmd = "node server.js " + user + " " + pw
5 | 
6 | CreateObject("Wscript.Shell").Run cmd, 0
-------------------------------------------------------------------------------- /Website/views/home.handlebars: --------------------------------------------------------------------------------
1 | 
2 | <!-- [home.handlebars: login dialog (#loginDlg) with account selector (#selAccount / #claimantID); add-to-cart product panels (heart, earrings, bracelet, ring); "Your Cart" table (#myTable); "Total Purchase: $0" display (#result); Purchase (#submitBtn) and Reset (#resetBtn) buttons; "Processing your order..." status area (#status); result alert area (#resultArea); two inline script blocks] -->
-------------------------------------------------------------------------------- /Website/views/layouts/main.handlebars: --------------------------------------------------------------------------------
1 | <!-- [main.handlebars: page head titled "Joseph's Mart" with CSS/JS includes; top navbar; {{{body}}} content placeholder; footer reading "Demo website for a fake company · Privacy · Terms"] -->
-------------------------------------------------------------------------------- /onlinefraud.pbix: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/r-server-fraud-detection/4ff1bc6740a9384d80dbf3cc2c97e374a47248af/onlinefraud.pbix
-------------------------------------------------------------------------------- /onlinefraudHDI.pbix: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/r-server-fraud-detection/4ff1bc6740a9384d80dbf3cc2c97e374a47248af/onlinefraudHDI.pbix
--------------------------------------------------------------------------------