├── .gitignore ├── Code Examples ├── KNN │ ├── .idea │ │ ├── .name │ │ ├── compiler.xml │ │ ├── copyright │ │ │ └── profiles_settings.xml │ │ ├── encodings.xml │ │ ├── libraries │ │ │ ├── Maven__com_github_haifengl_smile_core_1_0_3.xml │ │ │ ├── Maven__com_github_haifengl_smile_data_1_0_3.xml │ │ │ ├── Maven__com_github_haifengl_smile_graph_1_0_3.xml │ │ │ ├── Maven__com_github_haifengl_smile_math_1_0_3.xml │ │ │ ├── Maven__com_github_haifengl_smile_plot_1_0_2.xml │ │ │ └── Maven__org_swinglabs_swingx_swingx_all_1_6_4.xml │ │ ├── misc.xml │ │ ├── modules.xml │ │ ├── scala_compiler.xml │ │ ├── scopes │ │ │ └── scope_settings.xml │ │ ├── vcs.xml │ │ └── workspace.xml │ ├── KNN.iml │ ├── KNN_Example_1.csv │ ├── pom.xml │ ├── projectFilesBackup │ │ └── KNN.iml │ └── src │ │ └── main │ │ └── java │ │ └── KNNExample.scala ├── LinearRegression │ ├── .idea │ │ ├── .name │ │ ├── compiler.xml │ │ ├── copyright │ │ │ └── profiles_settings.xml │ │ ├── encodings.xml │ │ ├── libraries │ │ │ ├── Maven__com_github_haifengl_smile_core_1_0_3.xml │ │ │ ├── Maven__com_github_haifengl_smile_data_1_0_3.xml │ │ │ ├── Maven__com_github_haifengl_smile_graph_1_0_3.xml │ │ │ ├── Maven__com_github_haifengl_smile_math_1_0_3.xml │ │ │ ├── Maven__com_github_haifengl_smile_plot_1_0_2.xml │ │ │ ├── Maven__org_swinglabs_swingx_swingx_all_1_6_4.xml │ │ │ └── smile_1_0.xml │ │ ├── misc.xml │ │ ├── modules.xml │ │ ├── scala_compiler.xml │ │ ├── scopes │ │ │ └── scope_settings.xml │ │ ├── uiDesigner.xml │ │ ├── vcs.xml │ │ └── workspace.xml │ ├── LinearRegression.iml │ ├── data │ │ └── OLS_Regression_Example_3.csv │ ├── pom.xml │ ├── projectFilesBackup │ │ └── LinearRegression.iml │ └── src │ │ └── main │ │ └── java │ │ └── LinearRegression.scala ├── NaiveBayes │ ├── .idea │ │ ├── .name │ │ ├── compiler.xml │ │ ├── copyright │ │ │ └── profiles_settings.xml │ │ ├── encodings.xml │ │ ├── libraries │ │ │ ├── Maven__com_github_haifengl_smile_core_1_0_3.xml │ │ │ ├── 
Maven__com_github_haifengl_smile_data_1_0_3.xml │ │ │ ├── Maven__com_github_haifengl_smile_graph_1_0_3.xml │ │ │ ├── Maven__com_github_haifengl_smile_math_1_0_3.xml │ │ │ ├── Maven__com_github_haifengl_smile_plot_1_0_2.xml │ │ │ ├── Maven__org_swinglabs_swingx_swingx_all_1_6_4.xml │ │ │ ├── scala_compiler.xml │ │ │ └── smile_1_0.xml │ │ ├── misc.xml │ │ ├── modules.xml │ │ ├── scala_compiler.xml │ │ ├── scopes │ │ │ └── scope_settings.xml │ │ ├── vcs.xml │ │ └── workspace.xml │ ├── NaiveBayes.iml │ ├── data │ │ └── stopwords.txt │ ├── pom.xml │ ├── projectFilesBackup │ │ └── NaiveBayes.iml │ └── src │ │ └── main │ │ └── java │ │ ├── NaiveBayesExample.scala │ │ └── TDM.scala ├── PCA │ ├── .idea │ │ ├── .name │ │ ├── compiler.xml │ │ ├── copyright │ │ │ └── profiles_settings.xml │ │ ├── encodings.xml │ │ ├── highlighting.xml │ │ ├── libraries │ │ │ ├── Maven__com_github_haifengl_smile_core_1_0_3.xml │ │ │ ├── Maven__com_github_haifengl_smile_data_1_0_3.xml │ │ │ ├── Maven__com_github_haifengl_smile_graph_1_0_3.xml │ │ │ ├── Maven__com_github_haifengl_smile_math_1_0_3.xml │ │ │ ├── Maven__com_github_haifengl_smile_plot_1_0_2.xml │ │ │ ├── Maven__org_swinglabs_swingx_swingx_all_1_6_4.xml │ │ │ └── smile_1_0.xml │ │ ├── misc.xml │ │ ├── modules.xml │ │ ├── scala_compiler.xml │ │ ├── scopes │ │ │ └── scope_settings.xml │ │ ├── vcs.xml │ │ └── workspace.xml │ ├── PCA.iml │ ├── data │ │ ├── PCA_Example_1.csv │ │ └── PCA_Example_2.csv │ ├── pom.xml │ ├── projectFilesBackup │ │ └── PCA.iml │ └── src │ │ └── main │ │ └── java │ │ └── PCA.scala ├── RecommendationSystem │ ├── .idea │ │ ├── .name │ │ ├── compiler.xml │ │ ├── copyright │ │ │ └── profiles_settings.xml │ │ ├── encodings.xml │ │ ├── libraries │ │ │ ├── Maven__com_github_haifengl_smile_core_1_0_3.xml │ │ │ ├── Maven__com_github_haifengl_smile_data_1_0_3.xml │ │ │ ├── Maven__com_github_haifengl_smile_graph_1_0_3.xml │ │ │ ├── Maven__com_github_haifengl_smile_math_1_0_3.xml │ │ │ ├── 
Maven__com_github_haifengl_smile_plot_1_0_2.xml │ │ │ ├── Maven__org_swinglabs_swingx_swingx_all_1_6_4.xml │ │ │ ├── joda_time_joda_time_2_2.xml │ │ │ └── smile_1_0.xml │ │ ├── misc.xml │ │ ├── modules.xml │ │ ├── scala_compiler.xml │ │ ├── scopes │ │ │ └── scope_settings.xml │ │ ├── uiDesigner.xml │ │ └── vcs.xml │ ├── RecommendationSystem.iml │ ├── data │ │ └── stopwords.txt │ ├── pom.xml │ ├── projectFilesBackup │ │ └── RecommendationSystem.iml │ └── src │ │ └── main │ │ └── java │ │ └── RecommendationSystem.scala ├── SVM │ ├── .idea │ │ ├── .name │ │ ├── compiler.xml │ │ ├── copyright │ │ │ └── profiles_settings.xml │ │ ├── encodings.xml │ │ ├── libraries │ │ │ ├── Maven__com_github_haifengl_smile_core_1_0_3.xml │ │ │ ├── Maven__com_github_haifengl_smile_data_1_0_3.xml │ │ │ ├── Maven__com_github_haifengl_smile_graph_1_0_3.xml │ │ │ ├── Maven__com_github_haifengl_smile_math_1_0_3.xml │ │ │ ├── Maven__com_github_haifengl_smile_plot_1_0_2.xml │ │ │ └── Maven__org_swinglabs_swingx_swingx_all_1_6_4.xml │ │ ├── misc.xml │ │ ├── modules.xml │ │ ├── scala_compiler.xml │ │ ├── scopes │ │ │ └── scope_settings.xml │ │ ├── uiDesigner.xml │ │ ├── vcs.xml │ │ └── workspace.xml │ ├── SVM.iml │ ├── data │ │ ├── SVM_Example_1.csv │ │ ├── SVM_Example_2.csv │ │ └── SVM_Example_2_Test_data.csv │ ├── pom.xml │ ├── projectFilesBackup │ │ └── SVM.iml │ └── src │ │ └── main │ │ └── java │ │ ├── SVM_Example_2.scala │ │ └── SupportVectorMachine.scala └── TextRegression │ ├── .idea │ ├── .name │ ├── compiler.xml │ ├── copyright │ │ └── profiles_settings.xml │ ├── encodings.xml │ ├── libraries │ │ ├── Maven__com_github_haifengl_smile_core_1_0_2.xml │ │ ├── Maven__com_github_haifengl_smile_data_1_0_1.xml │ │ ├── Maven__com_github_haifengl_smile_graph_1_0_1.xml │ │ ├── Maven__com_github_haifengl_smile_math_1_0_2.xml │ │ ├── Maven__com_github_haifengl_smile_plot_1_0_2.xml │ │ ├── Maven__com_github_tototoshi_scala_csv_2_11_1_2_1.xml │ │ ├── Maven__org_scala_lang_scala_library_2_11_6.xml │ │ 
├── Maven__org_swinglabs_swingx_swingx_all_1_6_4.xml │ │ ├── com_github_tototoshi_scala_csv_2_11_1_2_0.xml │ │ └── smile_1_0.xml │ ├── misc.xml │ ├── modules.xml │ ├── scala_compiler.xml │ ├── scopes │ │ └── scope_settings.xml │ ├── uiDesigner.xml │ ├── vcs.xml │ └── workspace.xml │ ├── TextRegression.iml │ ├── data │ ├── TextRegression_Example_1.csv │ └── stopwords.txt │ ├── pom.xml │ ├── projectFilesBackup │ └── TextRegression.iml │ └── src │ └── main │ └── java │ ├── DTM.scala │ └── TextRegression.scala ├── Example Data ├── KNN_Example_1.csv ├── OLS_Regression_Example_3.csv ├── PCA_Example_1.csv ├── PCA_Example_2.csv ├── Recommendation_Example_1.zip ├── SVM_Example_1.csv ├── SVM_Example_2.csv ├── SVM_Example_2_Test_data.csv ├── TextRegression_Example_1.csv └── stopwords.txt ├── Explanatory Graphs └── ML Blog.graffle ├── Images ├── DynamicMachineLearning.png ├── Formula1.png ├── Formula2.png ├── Formula3.png ├── Formula4.png ├── Good_Fit.png ├── Ham_No_Stopwords.png ├── Ham_Stopwords.png ├── HumanDataPoints.png ├── KNNPlot.png ├── Mail_per_Sender_Distribution.png ├── Mail_per_Sender_log_Distribution.png ├── Mail_per_Subject_Distribution.png ├── Mail_per_Subject_log_Distribution.png ├── MaleFemalePlot.png ├── OverFitting.png ├── PCA_Explanatory_Data.png ├── PCA_Normalised.png ├── PCA_Reduced_Dimension.png ├── Precision.png ├── PrecisionFull.png ├── PrecisionHalf.png ├── Recall.png ├── RecallFull.png ├── RecallHalf.png ├── SVM_Datapoints.png ├── SVM_TestData.png ├── SVM_TrainData.png ├── Spam_No_Stopwords.png ├── Spam_Stopwords.png ├── Under-fitting.png ├── Unscaled_DJI_PCA_Index.png ├── Unscaled_PCA_Index.png └── Weighted_Subject_Distribution.png └── Readme.md /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | *.class 3 | 4 | Code/Scala/BinaryClassification/.idea/workspace.xml 5 | 6 | Code Examples/KNN/.idea/workspace.xml 7 | 8 | Code Examples/RecommendationSystem/.idea/workspace.xml 9 | 
-------------------------------------------------------------------------------- /Code Examples/KNN/.idea/.name: -------------------------------------------------------------------------------- 1 | KNN -------------------------------------------------------------------------------- /Code Examples/KNN/.idea/compiler.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 32 | -------------------------------------------------------------------------------- /Code Examples/KNN/.idea/copyright/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /Code Examples/KNN/.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /Code Examples/KNN/.idea/libraries/Maven__com_github_haifengl_smile_core_1_0_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/KNN/.idea/libraries/Maven__com_github_haifengl_smile_data_1_0_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/KNN/.idea/libraries/Maven__com_github_haifengl_smile_graph_1_0_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/KNN/.idea/libraries/Maven__com_github_haifengl_smile_math_1_0_3.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/KNN/.idea/libraries/Maven__com_github_haifengl_smile_plot_1_0_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/KNN/.idea/libraries/Maven__org_swinglabs_swingx_swingx_all_1_6_4.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/KNN/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /Code Examples/KNN/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /Code Examples/KNN/.idea/scala_compiler.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /Code Examples/KNN/.idea/scopes/scope_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | -------------------------------------------------------------------------------- /Code Examples/KNN/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /Code 
Examples/KNN/KNN.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /Code Examples/KNN/KNN_Example_1.csv: -------------------------------------------------------------------------------- 1 | "X","Y","Label" 2 | 2.37354618925767,5.39810588036707,0 3 | 3.18364332422208,4.38797360674923,0 4 | 2.16437138758995,5.34111969142442,0 5 | 4.59528080213779,3.87063690391921,0 6 | 3.32950777181536,6.43302370170104,0 7 | 2.17953161588198,6.98039989850586,0 8 | 3.48742905242849,4.63277852353349,0 9 | 3.73832470512922,3.95586537368347,0 10 | 3.57578135165349,5.56971962744241,0 11 | 2.69461161284364,4.86494539611918,0 12 | 4.51178116845085,7.40161776050478,0 13 | 3.38984323641143,4.96075999726683,0 14 | 2.3787594194582,5.68973936245078,0 15 | 0.7853001128225,5.02800215878067,0 16 | 4.12493091814311,4.25672679111759,0 17 | 2.95506639098477,5.18879229951434,0 18 | 2.98380973690105,3.19504137110896,0 19 | 3.9438362106853,6.46555486156289,0 20 | 3.82122119509809,5.1532533382119,0 21 | 3.59390132121751,7.17261167036215,0 22 | 3.91897737160822,5.47550952889966,0 23 | 3.78213630073107,4.29005356907819,0 24 | 3.07456498336519,5.61072635348905,0 25 | 1.01064830413663,4.06590236835575,0 26 | 3.61982574789471,3.7463665997609,0 27 | 2.943871260471,5.29144623551746,0 28 | 2.84420449329467,4.55670812678157,0 29 | 1.52924761610073,5.00110535163162,0 30 | 2.52184994489138,5.07434132415166,0 31 | 3.4179415601997,4.41047905381193,0 32 | 4.35867955152904,4.4313312671815,0 33 | 2.897212272657,4.86482138487617,0 34 | 3.38767161155937,6.1780869965732,0 35 | 2.94619495941709,3.47643319957024,0 36 | 1.62294044317139,5.59394618762842,0 37 | 2.58500543670032,5.33295037121352,0 38 | 2.60571004628965,6.06309983727636,0 39 | 2.94068660328881,4.6958160763657,0 40 | 
4.10002537198388,5.37001880991629,0 41 | 3.76317574845754,5.26709879077223,0 42 | 2.83547640374641,4.45747996900835,0 43 | 2.74663831986349,6.20786780598317,0 44 | 3.69696337540474,6.16040261569495,0 45 | 3.55666319867366,5.700213649515,0 46 | 2.31124430545048,6.58683345454085,0 47 | 2.29250484303788,5.5584864255653,0 48 | 3.36458196213683,3.72340779154196,0 49 | 3.76853292451542,4.42673458576311,0 50 | 2.88765378784977,3.77538738510164,0 51 | 3.88110772645421,4.52659936356069,0 52 | 4.37963332277588,7.45018710127266,1 53 | 5.04211587314424,6.98144016728536,1 54 | 4.08907835144755,6.68193162545616,1 55 | 5.15802877240407,6.0706378525463,1 56 | 4.34541535608118,5.51253968985852,1 57 | 6.76728726937265,5.92480770338432,1 58 | 5.71670747601721,8.00002880371391,1 59 | 5.91017422949523,6.37873330520318,1 60 | 5.38418535782634,5.61557315261551,1 61 | 6.68217608051942,8.86929062242358,1 62 | 4.36426354605102,7.42510037737245,1 63 | 4.53835526963943,6.76135289908697,1 64 | 6.43228223854166,8.05848304870902,1 65 | 4.34930364668963,7.88642265137494,1 66 | 4.79261925639803,6.38075695176885,1 67 | 4.60719207055802,9.20610246454047,1 68 | 4.68000713145149,6.74497296985898,1 69 | 4.72088669702344,5.57550534978719,1 70 | 5.49418833126783,6.85560039804578,1 71 | 4.82266951773039,7.20753833923234,1 72 | 4.49404253788574,9.30797839905936,1 73 | 6.34303882517041,7.10580236789371,1 74 | 4.78542059145313,7.45699880542341,1 75 | 4.82044346995661,6.92284706464347,1 76 | 4.89980925878644,6.66599915763346,1 77 | 5.71266630705141,6.96527397168872,1 78 | 4.92643559587367,7.78763960563016,1 79 | 4.96236582853295,9.07524500865228,1 80 | 4.31833952124434,8.02739243876377,1 81 | 4.67572972775368,8.2079083983867,1 82 | 5.06016044043452,5.76867657844196,1 83 | 4.41110551374034,7.98389557005338,1 84 | 5.53149619263257,7.21992480366065,1 85 | 3.48160591821321,5.53274997090776,1 86 | 5.30655786078977,7.52102274264814,1 87 | 3.46355017646241,6.84124539528398,1 88 | 4.69902387316339,8.4645873119698,1 
import java.awt.Color
import java.io.File
import smile.classification.KNN
import smile.plot._
import smile.validation._
import scala.swing._

/**
 * K-nearest-neighbours example using Smile.
 *
 * Building the main frame loads the labelled 2D points from `KNN_Example_1.csv`,
 * shows them in a scatter plot, runs a 2-round cross validation of a 3-NN
 * classifier (printing the false prediction rate per round) and finally prints
 * the predicted class for one hard-coded, unlabelled point.
 */
object KNNExample extends SimpleSwingApplication {

  def top = new MainFrame {
    title = "KNN Example from http://xyclade.ml"
    val basePath = "KNN_Example_1.csv"

    try {
      // testData._1 holds the (x, y) points, testData._2 the matching class labels.
      val testData = GetDataFromCSV(new File(basePath))

      val plot = ScatterPlot.plot(testData._1, testData._2, '@', Array(Color.red, Color.blue))
      peer.setContentPane(plot)
      size = new Dimension(400, 400)

      // Define the amount of rounds, in our case 2, and initialise the cross validation.
      val validationRounds = 2
      val neighbours = 3
      val cv = new CrossValidation(testData._2.length, validationRounds)

      // cv.train / cv.test are used here as per-round arrays of row indices into the
      // data set, so the rows are looked up by direct indexing instead of scanning a
      // zipWithIndex'ed copy for every single index.
      cv.train.zip(cv.test).foreach { case (trainIndices, testIndices) =>
        val trainingPoints = trainIndices.map(index => testData._1(index))
        val trainingLabels = trainIndices.map(index => testData._2(index))
        val testingPoints = testIndices.map(index => testData._1(index))
        val testingLabels = testIndices.map(index => testData._2(index))

        val knn = KNN.learn(trainingPoints, trainingLabels, neighbours)

        // For each test data point make a prediction with the model ...
        val predictions = testingPoints.map(point => knn.predict(point))

        // ... and count the wrongly classified data points.
        val wrong = predictions.zip(testingLabels).count { case (predicted, actual) => predicted != actual }

        println("False prediction rate: " + wrong.toDouble / predictions.length * 100 + "%")

        // Classify a point whose label is not known.
        val unknownDataPoint = Array(5.3, 4.3)

        knn.predict(unknownDataPoint) match {
          case 0 => println("Internet Service Provider Alpha")
          case 1 => println("Internet Service Provider Beta")
          case _ => println("Unexpected prediction")
        }
      }
    }
    catch {
      case e: Exception => println("You probably are missing the KNN sample file, or did not set the path correctly. Check the exception for more details: " + e)
    }
  }

  /**
   * Reads the example CSV file.
   *
   * The first line is treated as a header and skipped; blank lines are ignored.
   * The underlying source is closed even when a row cannot be parsed.
   *
   * @param file the CSV file to read
   * @return the data points (one `Array(x, y)` per row) and the matching class labels
   */
  def GetDataFromCSV(file: File): (Array[Array[Double]], Array[Int]) = {
    val source = scala.io.Source.fromFile(file)
    val data =
      try {
        source.getLines().drop(1).filter(_.trim.nonEmpty).map(x => GetDataFromString(x)).toArray
      } finally {
        source.close()
      }

    val dataPoints = data.map(x => x._1)
    val classifierArray = data.map(x => x._2)
    (dataPoints, classifierArray)
  }

  /**
   * Parses one CSV row of the form `x,y,label`.
   *
   * @param dataString a single comma separated line
   * @return the point as `Array(x, y)` together with its integer class label
   * @throws NumberFormatException if a field is not numeric
   * @throws ArrayIndexOutOfBoundsException if the line has fewer than three fields
   */
  def GetDataFromString(dataString: String): (Array[Double], Int) = {

    // Split the comma separated value string into an array of (trimmed) strings
    val dataArray: Array[String] = dataString.split(',').map(_.trim)

    // Extract the values from the strings
    val xCoordinate: Double = dataArray(0).toDouble
    val yCoordinate: Double = dataArray(1).toDouble
    val classifier: Int = dataArray(2).toInt

    // And return the result in a format that can later easily be used to feed to Smile
    (Array(xCoordinate, yCoordinate), classifier)
  }
}
Examples/LinearRegression/.idea/compiler.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 32 | -------------------------------------------------------------------------------- /Code Examples/LinearRegression/.idea/copyright/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /Code Examples/LinearRegression/.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /Code Examples/LinearRegression/.idea/libraries/Maven__com_github_haifengl_smile_core_1_0_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/LinearRegression/.idea/libraries/Maven__com_github_haifengl_smile_data_1_0_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/LinearRegression/.idea/libraries/Maven__com_github_haifengl_smile_graph_1_0_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/LinearRegression/.idea/libraries/Maven__com_github_haifengl_smile_math_1_0_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code 
Examples/LinearRegression/.idea/libraries/Maven__com_github_haifengl_smile_plot_1_0_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/LinearRegression/.idea/libraries/Maven__org_swinglabs_swingx_swingx_all_1_6_4.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/LinearRegression/.idea/libraries/smile_1_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /Code Examples/LinearRegression/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /Code Examples/LinearRegression/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /Code Examples/LinearRegression/.idea/scala_compiler.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /Code Examples/LinearRegression/.idea/scopes/scope_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | -------------------------------------------------------------------------------- /Code Examples/LinearRegression/.idea/uiDesigner.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | -------------------------------------------------------------------------------- /Code Examples/LinearRegression/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /Code Examples/LinearRegression/LinearRegression.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /Code Examples/LinearRegression/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | Xyclade.ml 8 | LinearRegression 9 | 1.0-SNAPSHOT 10 | 11 | 12 | com.github.haifengl 13 | smile-core 14 | 1.0.3 15 | 16 | 17 | com.github.haifengl 18 | smile-plot 19 | 1.0.2 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /Code Examples/LinearRegression/projectFilesBackup/LinearRegression.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 9 | 10 | 11 | 12 | 13 
import java.awt.Color
import java.io.File

import smile.plot._
import smile.regression._
import scala.swing._


/**
 * Ordinary-least-squares example using Smile.
 *
 * Building the main frame loads gender/height/weight rows from
 * `data/OLS_Regression_Example_3.csv`, plots height against weight coloured by
 * gender, fits an OLS model on (gender, height) -> weight and prints two sample
 * predictions together with the model error and R squared.
 */
object LinearRegression extends SimpleSwingApplication {
  def top = new MainFrame {
    title = "Linear regression Example from http://Xyclade.ml"
    val basePath = "data/OLS_Regression_Example_3.csv"

    // test_data._1 holds Array(gender, height) per row, test_data._2 the weights.
    val test_data = GetDataFromCSV(new File(basePath))

    // Plot height (feature index 1) against weight, labelled by gender (feature index 0).
    val plotData = (test_data._1 zip test_data._2).map(x => Array(x._1(1), x._2))
    val maleFemaleLabels = test_data._1.map(x => x(0).toInt)
    val plot = ScatterPlot.plot(plotData, maleFemaleLabels, '@', Array(Color.blue, Color.green))
    plot.setTitle("Weight and heights for males and females")
    plot.setAxisLabel(0, "Heights")
    plot.setAxisLabel(1, "Weights")

    peer.setContentPane(plot)
    size = new Dimension(400, 400)

    val olsModel = new OLS(test_data._1, test_data._2)

    println("Prediction for Male of 1.7M: " + olsModel.predict(Array(0.0, 170.0)))
    println("Prediction for Female of 1.7M:" + olsModel.predict(Array(1.0, 170.0)))

    println("Model Error:" + olsModel.error())
    println("Accuracy of the model: " + olsModel.RSquared() * 100 + "%")
  }

  /**
   * Reads the example CSV file.
   *
   * The first line is treated as a header and skipped; blank lines are ignored.
   * The underlying source is closed even when a row cannot be parsed.
   *
   * @param file the CSV file to read
   * @return the model inputs (one `Array(gender, height)` per row) and the matching weights
   */
  def GetDataFromCSV(file: File): (Array[Array[Double]], Array[Double]) = {
    val source = scala.io.Source.fromFile(file)
    val data =
      try {
        source.getLines().drop(1).filter(_.trim.nonEmpty).map(x => GetDataFromString(x)).toArray
      } finally {
        source.close()
      }

    val inputData = data.map(x => x._1)
    val resultData = data.map(x => x._2)

    (inputData, resultData)
  }

  /**
   * Parses one CSV row of the form `gender,height,weight`.
   *
   * A gender field equal to `"Male"` (including the double quotes) is encoded
   * as 0.0, anything else as 1.0. Height is multiplied by 2.54 and weight by
   * 0.45359237 (inch -> cm and pound -> kg).
   *
   * @param dataString a single comma separated line
   * @return `Array(gender, height)` together with the weight
   * @throws NumberFormatException if height or weight is not numeric
   * @throws ArrayIndexOutOfBoundsException if the line has fewer than three fields
   */
  def GetDataFromString(dataString: String): (Array[Double], Double) = {

    // Split the comma separated value string into an array of (trimmed) strings
    val dataArray: Array[String] = dataString.split(',').map(_.trim)
    val person = if (dataArray(0) == "\"Male\"") 0.0 else 1.0

    // Extract the values from the strings.
    // Since the data is in US units (inches and pounds), we convert it to cm and kilograms.
    val data: Array[Double] = Array(person, dataArray(1).toDouble * 2.54)
    val weight: Double = dataArray(2).toDouble * 0.45359237

    // And return the result in a format that can later easily be used to feed to Smile
    (data, weight)
  }
}
Examples/NaiveBayes/.idea/libraries/Maven__com_github_haifengl_smile_data_1_0_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/NaiveBayes/.idea/libraries/Maven__com_github_haifengl_smile_graph_1_0_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/NaiveBayes/.idea/libraries/Maven__com_github_haifengl_smile_math_1_0_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/NaiveBayes/.idea/libraries/Maven__com_github_haifengl_smile_plot_1_0_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/NaiveBayes/.idea/libraries/Maven__org_swinglabs_swingx_swingx_all_1_6_4.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/NaiveBayes/.idea/libraries/scala_compiler.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /Code Examples/NaiveBayes/.idea/libraries/smile_1_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 
13 | 14 | -------------------------------------------------------------------------------- /Code Examples/NaiveBayes/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /Code Examples/NaiveBayes/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /Code Examples/NaiveBayes/.idea/scala_compiler.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /Code Examples/NaiveBayes/.idea/scopes/scope_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | -------------------------------------------------------------------------------- /Code Examples/NaiveBayes/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /Code Examples/NaiveBayes/NaiveBayes.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /Code Examples/NaiveBayes/data/stopwords.txt: -------------------------------------------------------------------------------- 1 | i 2 | me 3 | my 4 | myself 5 | we 6 | our 7 | ours 8 | ourselves 9 | you 10 | your 11 | yours 12 | yourself 13 | yourselves 14 | he 15 | him 16 | his 17 | himself 18 | she 19 | her 20 | hers 21 | herself 22 | it 23 | its 24 | itself 25 | they 26 | them 27 | 
their 28 | theirs 29 | themselves 30 | what 31 | which 32 | who 33 | whom 34 | this 35 | that 36 | these 37 | those 38 | am 39 | is 40 | are 41 | was 42 | were 43 | be 44 | been 45 | being 46 | have 47 | has 48 | had 49 | having 50 | do 51 | does 52 | did 53 | doing 54 | would 55 | should 56 | could 57 | ought 58 | im 59 | youre 60 | hes 61 | shes 62 | were 63 | theyre 64 | ive 65 | youve 66 | weve 67 | theyve 68 | id 69 | youd 70 | hed 71 | shed 72 | wed 73 | theyd 74 | ill 75 | youll 76 | hell 77 | shell 78 | well 79 | theyll 80 | isnt 81 | arent 82 | wasnt 83 | werent 84 | hasnt 85 | havent 86 | hadnt 87 | doesnt 88 | dont 89 | didnt 90 | wont 91 | wouldnt 92 | shant 93 | shouldnt 94 | cant 95 | cannot 96 | couldnt 97 | mustnt 98 | lets 99 | thats 100 | whos 101 | whats 102 | heres 103 | theres 104 | whens 105 | wheres 106 | whys 107 | hows 108 | a 109 | an 110 | the 111 | and 112 | but 113 | if 114 | or 115 | because 116 | as 117 | until 118 | while 119 | of 120 | at 121 | by 122 | for 123 | with 124 | about 125 | against 126 | between 127 | into 128 | through 129 | during 130 | before 131 | after 132 | above 133 | below 134 | to 135 | from 136 | up 137 | down 138 | in 139 | out 140 | on 141 | off 142 | over 143 | under 144 | again 145 | further 146 | then 147 | once 148 | here 149 | there 150 | when 151 | where 152 | why 153 | how 154 | all 155 | any 156 | both 157 | each 158 | few 159 | more 160 | most 161 | other 162 | some 163 | such 164 | no 165 | nor 166 | not 167 | only 168 | own 169 | same 170 | so 171 | than 172 | too 173 | very 174 | tr 175 | td 176 | 177 | 178 | -------------------------------------------------------------------------------- /Code Examples/NaiveBayes/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | groupId 8 | NaiveBayes 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 13 | com.github.haifengl 14 | smile-core 15 | 1.0.3 16 | 17 | 18 | com.github.haifengl 19 | smile-plot 20 | 1.0.2 21 
import java.io.File
import smile.classification.NaiveBayes
import smile.feature.Bag

import scala.collection.mutable

/**
 * Spam / ham e-mail classification example using Smile's multinomial Naive Bayes.
 *
 * The example reads mails from `data/spam` and `data/easy_ham`, selects the most
 * widespread terms of each class as features, trains the classifier and finally
 * prints how the mails in `data/easy_ham_2` are classified.
 */
object NaiveBayesExample {

  def main(args: Array[String]): Unit = {
    val basePath = "data"
    val spamPath = basePath + "/spam"
    val easyHamPath = basePath + "/easy_ham"
    val easyHam2Path = basePath + "/easy_ham_2"

    val amountOfSamplesPerSet = 500
    val amountOfFeaturesToTake = 400

    try {
      //A Set gives constant-time lookups while filtering every word of every mail
      val stopWords = getStopWords.toSet

      //Take a subset of the spam files (500 is the complete set in this case) together with their message bodies
      val spamMails = getFilesFromDir(spamPath)
        .take(amountOfSamplesPerSet)
        .map(file => (file, getMessage(file)))

      //Same for the ham files (no random sampling is done here, the first files found are used)
      val hamMails = getFilesFromDir(easyHamPath)
        .take(amountOfSamplesPerSet)
        .map(file => (file, getMessage(file)))

      //Feature selection: the terms that occur in the most documents of each class
      val spamFeatures = topFeatures(spamMails, stopWords, amountOfFeaturesToTake)
      val hamFeatures = topFeatures(hamMails, stopWords, amountOfFeaturesToTake)

      //Terms that are a top feature of both classes do not discriminate, so they are removed as noise
      val allFeatures: Set[String] = (hamFeatures ++ spamFeatures).toSet
      val data = allFeatures -- hamFeatures.intersect(spamFeatures)

      //Initialize a bag of words over the remaining features
      val bag = new Bag[String](data.toArray)

      //Labels: 0 (spam) for every spam mail followed by 1 (ham) for every ham mail.
      //The sizes follow the mails that were actually loaded, so the label array always
      //matches the training data even when a directory holds fewer files than requested.
      val classifiers = Array.fill[Int](spamMails.length)(0) ++ Array.fill[Int](hamMails.length)(1)

      //Turn the message bodies into feature vectors, spam first, then ham (same order as the labels)
      val spamData = spamMails.map(mail => bag.feature(mail._2.split(" "))).toArray
      val hamData = hamMails.map(mail => bag.feature(mail._2.split(" "))).toArray
      val trainingData = spamData ++ hamData

      //Create the bayes model as a multinomial with 2 classification groups and the amount of features passed in the constructor.
      val bayes = new NaiveBayes(NaiveBayes.Model.MULTINOMIAL, 2, data.size)
      bayes.learn(trainingData, classifiers)

      //Evaluation on the testing set (the files of the easy_ham_2 directory)
      val testFiles = getFilesFromDir(easyHam2Path)
      val predictions = testFiles.map(file => bayes.predict(bag.feature(getMessage(file).split(" "))))
      val total = testFiles.length

      //Prints the absolute and relative amount of mails that received a given classification
      def report(count: Int, description: String): Unit = {
        println(count.toString + " of " + total + " were " + description)
        println(((count.toDouble / total) * 100).toString + "% was " + description)
      }

      report(predictions.count(x => x == 0), "classified as spam")
      report(predictions.count(x => x == 1), "classified as ham")
      report(predictions.count(x => x == -1), "unknowingly classified")
    }
    catch {
      case e: Exception => println("You probably are missing the sample data. You can download these from the spamassasin corpus (mentioned in the example on http://xyclade.ml) and place them in the directory 'data' in this project. Check the exception for more details: " + e);
    }
  }

  /**
   * Selects the terms that occur in the largest number of mails.
   *
   * @param mails     pairs of (mail file, message body as returned by getMessage)
   * @param stopWords words that are never considered as a feature
   * @param amount    maximum number of terms to return
   * @return the terms ordered by the number of distinct mails containing them, descending
   */
  private def topFeatures(mails: List[(File, String)], stopWords: Set[String], amount: Int): List[String] = {
    val documentsPerTerm = mails
      .flatMap(mail => mail._2.split(" ")
        .filter(word => word.nonEmpty && !stopWords.contains(word))
        .map(word => (word, mail._1.getName)))
      .groupBy(pair => pair._1)
      .map(group => (group._1, group._2.map(pair => pair._2).distinct.size))
      .toList

    //Ordering by document count equals ordering by occurrence rate (count / amount of mails),
    //as the divisor is the same positive number for every term.
    documentsPerTerm.sortBy(entry => -entry._2).take(amount).map(entry => entry._1)
  }

  /**
   * Lists the regular files in the given directory.
   *
   * @param path directory to list
   * @return the files found, or an empty list when the path is not a readable directory
   */
  def getFilesFromDir(path: String): List[File] = {
    val d = new File(path)
    if (d.exists && d.isDirectory) {
      //listFiles returns null when the directory cannot be read
      Option(d.listFiles).map(_.toList).getOrElse(List[File]())
        //Remove the mac os basic storage file, and alternatively for unix systems "cmds"
        .filter(x => x.isFile && !x.toString.contains(".DS_Store") && !x.toString.contains("cmds"))
    } else {
      List[File]()
    }
  }

  /**
   * Reads the stop words from data/stopwords.txt, one word per line.
   *
   * @return the non-empty lines of the file with surrounding whitespace (including a trailing carriage return) removed
   */
  def getStopWords: List[String] = {
    val source = scala.io.Source.fromFile(new File("data/stopwords.txt"))("latin1")
    try {
      source.getLines().map(line => line.trim).filter(line => line.nonEmpty).toList
    } finally {
      source.close()
    }
  }

  /**
   * Extracts the normalised message body from an e-mail file.
   *
   * @param file the e-mail file, read as latin1
   * @return everything from the first empty line onwards, with line breaks turned into spaces,
   *         all characters other than a-z, A-Z and space removed, in lower case; the empty
   *         string when the file contains no empty line (headers without a body)
   */
  def getMessage(file: File): String = {
    //Note that the encoding of the example files is latin1, thus this should be passed to the from file method.
    val source = scala.io.Source.fromFile(file)("latin1")
    val lines = try {
      source.getLines().mkString("\n")
    } finally {
      source.close()
    }
    //The first empty line separates the headers from the message body
    val firstLineBreak = lines.indexOf("\n\n")
    if (firstLineBreak == -1) {
      //No separator: substring(-1) would throw, and there is no body to return
      ""
    } else {
      lines.substring(firstLineBreak).replace("\n", " ").replaceAll("[^a-zA-Z ]", "").toLowerCase
    }
  }
}

/**
 * A simple term-document matrix: for every term it keeps how often the term
 * occurs in each document.
 */
class TDM {

  var records: List[TDMRecord] = List[TDMRecord]()

  /**
   * Registers one occurrence of a term in a document.
   *
   * @param term         the term that was encountered
   * @param documentName name of the document in which the term was encountered
   */
  def addTermToRecord(term: String, documentName: String): Unit = {
    records.find(x => x.term == term) match {
      case Some(termRecord) =>
        //Keyed lookup instead of scanning all entries of the map
        val current = termRecord.occurrences.getOrElse(documentName, 0)
        termRecord.occurrences += documentName -> (current + 1)
      case None =>
        //No record yet exists for this term
        val newRecord = new TDMRecord(term, mutable.HashMap[String, Int](documentName -> 1))
        records = newRecord :: records
    }
  }

  /** Orders the records by the total number of occurrences of the term, descending. */
  def SortByTotalFrequency(): Unit = records = records.sortBy(x => -x.totalFrequency)

  /**
   * Orders the records by occurrence rate, descending.
   *
   * @param rate the total number of documents used to compute the occurrence rate
   */
  def SortByOccurrenceRate(rate: Int): Unit = records = records.sortBy(x => -x.occurrenceRate(rate))
}

/**
 * One row of the term-document matrix.
 *
 * @param term        the term this record describes
 * @param occurrences number of occurrences of the term per document name
 */
class TDMRecord(val term: String, var occurrences: mutable.HashMap[String, Int]) {
  /** Sum of the occurrences over all documents. */
  def totalFrequency: Int = occurrences.values.sum

  /** Number of documents containing the term divided by totalDocuments. */
  def occurrenceRate(totalDocuments: Int): Double = occurrences.size.toDouble / totalDocuments

  /** Total frequency of the term divided by totalTerms. */
  def densityRate(totalTerms: Int): Double = totalFrequency.toDouble / totalTerms
}
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /Code Examples/PCA/.idea/libraries/Maven__com_github_haifengl_smile_core_1_0_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/PCA/.idea/libraries/Maven__com_github_haifengl_smile_data_1_0_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/PCA/.idea/libraries/Maven__com_github_haifengl_smile_graph_1_0_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/PCA/.idea/libraries/Maven__com_github_haifengl_smile_math_1_0_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/PCA/.idea/libraries/Maven__com_github_haifengl_smile_plot_1_0_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/PCA/.idea/libraries/Maven__org_swinglabs_swingx_swingx_all_1_6_4.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 
-------------------------------------------------------------------------------- /Code Examples/PCA/.idea/libraries/smile_1_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /Code Examples/PCA/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /Code Examples/PCA/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /Code Examples/PCA/.idea/scala_compiler.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /Code Examples/PCA/.idea/scopes/scope_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | -------------------------------------------------------------------------------- /Code Examples/PCA/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /Code Examples/PCA/PCA.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /Code Examples/PCA/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | Xyclade.ml 8 | PCA 9 | 1.0-SNAPSHOT 10 | 
import java.awt.{Dimension, Color}
import java.io.{PrintWriter, File}
import java.text.{DecimalFormat, DateFormat, SimpleDateFormat}
import java.util.{Locale, Date}

import smile.math.distance.CorrelationDistance
import smile.plot.{PlotCanvas, LinePlot, ScatterPlot, Line}
import smile.projection.PCA

import scala.swing.{MainFrame, SimpleSwingApplication}
import scala.util.Random

/**
 * PCA example: merges the closing prices of several stocks into a single feature
 * and plots it together with the Dow Jones Index values for comparison.
 */
object PCA extends SimpleSwingApplication {

  def top = new MainFrame {

    title = "PCA Example from http://xyclade.ml"
    //Get the example data
    val basePath = "data/"
    val exampleDataPath = basePath + "PCA_Example_1.csv"
    val trainData = GetStockDataFromCSV(new File(exampleDataPath))

    val pca = new PCA(trainData._2)

    //We want to merge into 1 feature
    pca.setProjection(1)
    val points = pca.project(trainData._2)

    //Scale the merged feature by its value range; the sign is flipped before plotting
    //NOTE(review): presumably the flip makes the line comparable with the index - confirm
    val mergedFeature = points.map(x => x(0))
    val rangeValue = valueRange(mergedFeature)
    val plotData = mergedFeature.zipWithIndex.map(x => Array(x._2.toDouble, -x._1 / rangeValue))
    val canvas: PlotCanvas = LinePlot.plot("Merged Features Index", plotData, Line.Style.DASH, Color.RED);

    //Verification against DJI (the file is read once)
    val verificationDataPath = basePath + "PCA_Example_2.csv"
    val DJIIndex = GetDJIFromFile(new File(verificationDataPath))
    canvas.line("Dow Jones Index", DJIIndex._2, Line.Style.DOT_DASH, Color.BLUE)

    peer.setContentPane(canvas)
    size = new Dimension(700, 400)
  }

  /**
   * Difference between the largest and the smallest value, used as divisor for scaling.
   * Returns 1.0 for an empty array or when all values are equal, so that the division
   * never produces NaN or Infinity.
   */
  private def valueRange(values: Array[Double]): Double = {
    if (values.isEmpty) {
      1.0
    } else {
      val range = values.max - values.min
      if (range == 0.0) 1.0 else range
    }
  }

  /**
   * Reads stock records from a CSV file whose first line is a header.
   *
   * @param file CSV file with the columns parsed by GetStockDataFromString
   * @return the dates in ascending order and, per date, the close values ordered by stock name
   */
  def GetStockDataFromCSV(file: File): (Array[Date], Array[Array[Double]]) = {
    val source = scala.io.Source.fromFile(file)
    //Get all the records (minus the header), skipping blank lines
    val data = try {
      source.getLines().drop(1).filter(x => x.trim.nonEmpty).map(x => GetStockDataFromString(x)).toArray
    } finally {
      source.close()
    }
    //group all records by date, and sort the groups on date ascending
    val groupedByDate = data.groupBy(x => x._1).toArray.sortBy(x => x._1)
    //extract the values from the 3-tuple and turn them into an array of tuples: Array[(Date, Array[Double])]
    //the records of one date are ordered by stock name, so every column always belongs to the same stock
    val dateArrayTuples = groupedByDate.map(x => (x._1, x._2.sortBy(y => y._2).map(y => y._3)))

    //turn the tuples into two separate arrays for easier use later on
    (dateArrayTuples.map(x => x._1), dateArrayTuples.map(x => x._2))
  }

  /**
   * Parses one comma separated stock record.
   *
   * @param dataString record with the date (yyyy-MM-dd) in column 0, the stock name in column 1
   *                   and the close value in column 2
   * @return the tuple (date, stock, close)
   */
  def GetStockDataFromString(dataString: String): (Date, String, Double) = {
    //Split the comma separated value string into an array of strings
    val dataArray: Array[String] = dataString.split(',')

    val format = new SimpleDateFormat("yyyy-MM-dd")

    //Extract the values from the strings
    val date = format.parse(dataArray(0))
    val stock: String = dataArray(1)
    val close: Double = dataArray(2).toDouble

    //And return the result in a format that can later easily be used to feed to Smile
    (date, stock, close)
  }

  /**
   * Parses one comma separated Dow Jones Index record.
   *
   * @param dataString record with the date (yyyy-MM-dd) in column 0 and the close value in column 4
   * @return the tuple (date, close)
   */
  def GetDJIRecordFromString(dataString: String): (Date, Double) = {
    //Split the comma separated value string into an array of strings
    val dataArray: Array[String] = dataString.split(',')

    val format = new SimpleDateFormat("yyyy-MM-dd")

    //Extract the values from the strings
    val date = format.parse(dataArray(0))
    val close: Double = dataArray(4).toDouble

    //And return the result in a format that can later easily be used to feed to Smile
    (date, close)
  }

  /**
   * Reads the Dow Jones Index records from a CSV file whose first line is a header.
   *
   * @param file CSV file with the columns parsed by GetDJIRecordFromString
   * @return the dates in ascending order and the matching close values divided by the
   *         range (maximum - minimum) of all close values; two empty arrays when the
   *         file holds no records
   */
  def GetDJIFromFile(file: File): (Array[Date], Array[Double]) = {
    val source = scala.io.Source.fromFile(file)
    //Get all the records (minus the header), skipping blank lines
    val data = try {
      source.getLines().drop(1).filter(x => x.trim.nonEmpty).map(x => GetDJIRecordFromString(x)).toArray
    } finally {
      source.close()
    }

    //turn the tuples into two separate arrays for easier use later on
    val sortedData = data.sortBy(x => x._1)
    val dates = sortedData.map(x => x._1)
    val closes = sortedData.map(x => x._2)
    val rangeValue = valueRange(closes)
    val doubles = closes.map(x => x / rangeValue)

    (dates, doubles)
  }
}
Examples/RecommendationSystem/.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /Code Examples/RecommendationSystem/.idea/libraries/Maven__com_github_haifengl_smile_core_1_0_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/RecommendationSystem/.idea/libraries/Maven__com_github_haifengl_smile_data_1_0_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/RecommendationSystem/.idea/libraries/Maven__com_github_haifengl_smile_graph_1_0_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/RecommendationSystem/.idea/libraries/Maven__com_github_haifengl_smile_math_1_0_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/RecommendationSystem/.idea/libraries/Maven__com_github_haifengl_smile_plot_1_0_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/RecommendationSystem/.idea/libraries/Maven__org_swinglabs_swingx_swingx_all_1_6_4.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/RecommendationSystem/.idea/libraries/joda_time_joda_time_2_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /Code Examples/RecommendationSystem/.idea/libraries/smile_1_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /Code Examples/RecommendationSystem/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /Code Examples/RecommendationSystem/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /Code Examples/RecommendationSystem/.idea/scala_compiler.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /Code Examples/RecommendationSystem/.idea/scopes/scope_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | -------------------------------------------------------------------------------- /Code Examples/RecommendationSystem/.idea/uiDesigner.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 
14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | -------------------------------------------------------------------------------- /Code Examples/RecommendationSystem/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /Code Examples/RecommendationSystem/RecommendationSystem.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /Code Examples/RecommendationSystem/data/stopwords.txt: -------------------------------------------------------------------------------- 1 | i 2 | me 3 | my 4 | myself 5 | we 6 | our 7 | ours 8 | ourselves 9 | you 10 | your 11 | yours 12 | yourself 13 | yourselves 14 | he 15 | him 16 | his 17 | himself 18 | she 19 | her 20 | hers 21 | herself 22 | it 23 | its 24 | itself 25 | they 26 | them 27 | their 28 | theirs 29 | themselves 30 | what 31 | which 32 | who 33 | whom 34 | this 35 | that 36 | these 37 | those 38 | am 39 | is 40 | are 41 | was 42 | were 43 | be 44 | been 45 | being 46 | have 47 | has 48 | had 49 | having 50 | do 51 | does 52 | did 53 | doing 54 | would 55 | should 56 | could 57 | ought 58 | im 59 | youre 60 | hes 61 | 
shes 62 | were 63 | theyre 64 | ive 65 | youve 66 | weve 67 | theyve 68 | id 69 | youd 70 | hed 71 | shed 72 | wed 73 | theyd 74 | ill 75 | youll 76 | hell 77 | shell 78 | well 79 | theyll 80 | isnt 81 | arent 82 | wasnt 83 | werent 84 | hasnt 85 | havent 86 | hadnt 87 | doesnt 88 | dont 89 | didnt 90 | wont 91 | wouldnt 92 | shant 93 | shouldnt 94 | cant 95 | cannot 96 | couldnt 97 | mustnt 98 | lets 99 | thats 100 | whos 101 | whats 102 | heres 103 | theres 104 | whens 105 | wheres 106 | whys 107 | hows 108 | a 109 | an 110 | the 111 | and 112 | but 113 | if 114 | or 115 | because 116 | as 117 | until 118 | while 119 | of 120 | at 121 | by 122 | for 123 | with 124 | about 125 | against 126 | between 127 | into 128 | through 129 | during 130 | before 131 | after 132 | above 133 | below 134 | to 135 | from 136 | up 137 | down 138 | in 139 | out 140 | on 141 | off 142 | over 143 | under 144 | again 145 | further 146 | then 147 | once 148 | here 149 | there 150 | when 151 | where 152 | why 153 | how 154 | all 155 | any 156 | both 157 | each 158 | few 159 | more 160 | most 161 | other 162 | some 163 | such 164 | no 165 | nor 166 | not 167 | only 168 | own 169 | same 170 | so 171 | than 172 | too 173 | very 174 | tr 175 | td 176 | 177 | 178 | -------------------------------------------------------------------------------- /Code Examples/RecommendationSystem/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | Xyclade.ml 8 | RecommendationSystem 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 13 | com.github.haifengl 14 | smile-core 15 | 1.0.3 16 | 17 | 18 | com.github.haifengl 19 | smile-plot 20 | 1.0.2 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /Code Examples/RecommendationSystem/projectFilesBackup/RecommendationSystem.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 
import java.awt.{Rectangle}
import java.io.File
import java.text.SimpleDateFormat
import java.util.Date
import smile.plot.BarPlot

import scala.swing.{MainFrame, SimpleSwingApplication}
import scala.util.Try

/**
 * Priority-inbox style ranking example.
 *
 * Reads the e-mails in `data/easy_ham`, sorts them by date and splits them in half.
 * The first half is used to derive sender, thread and term weights; the second half is
 * ranked with those weights. A bar plot of the (log scaled) number of mails per sender is
 * shown and the ten highest ranked test mails are printed as a Markdown table.
 */
object RecommendationSystem extends SimpleSwingApplication {

  case class EmailData(emailDate: Date, sender: String, subject: String, body: String)

  def top = new MainFrame {
    title = "Recommendation System Example from http://xyclade.ml"

    val basePath = "data"
    val easyHamPath = basePath + "/easy_ham"

    try {
      val mails = getFilesFromDir(easyHamPath).map(x => getFullEmail(x))

      //Mails without a parsable date are skipped: getDateFromEmail returns null for them and
      //a null date would make the sort below fail with a NullPointerException.
      val timeSortedMails = mails
        .flatMap { raw =>
          Option(getDateFromEmail(raw)).map { date =>
            EmailData(date, getSenderFromEmail(raw), getSubjectFromEmail(raw), getMessageBodyFromEmail(raw))
          }
        }
        .sortBy(x => x.emailDate)

      //Fail with an explicit reason; the handler below turns it into the "missing sample data" hint.
      if (timeSortedMails.isEmpty) {
        throw new java.io.FileNotFoundException("No emails found in " + easyHamPath)
      }

      val (trainingData, testingData) = timeSortedMails
        .splitAt(timeSortedMails.length / 2)

      //Group the emails by sender, keep the sender address with the log scaled amount of emails,
      //and sort ascending on that amount
      val mailsGroupedBySender = trainingData
        .groupBy(x => x.sender)
        .map(x => (x._1, Math.log1p(x._2.length)))
        .toArray
        .sortBy(x => x._2)

      //The plotting library accepts the labels and the values as two separate arrays
      val senderDescriptions = mailsGroupedBySender.map(x => x._1)
      val senderValues = mailsGroupedBySender.map(x => x._2)

      val barPlot = BarPlot.plot("", senderValues, senderDescriptions)

      //Rotate the email addresses by -80 degrees such that we can read them
      barPlot.getAxis(0).setRotation(-1.3962634)
      barPlot.setAxisLabel(0, "")
      barPlot.setAxisLabel(1, "Amount of emails received on log Scale ")
      peer.setContentPane(barPlot)

      bounds = new Rectangle(800, 600)

      //Sender address -> weight, as a map so that ranking a mail is a constant time lookup
      val senderWeights: Map[String, Double] = mailsGroupedBySender.toMap

      //Subject -> thread weight. The weight is based on the number of mails in the thread divided
      //by the time (in seconds) between its first and last mail. Subjects whose mails span less
      //than a second are not threads and get no entry.
      val threadWeights: Map[String, Double] = trainingData
        .groupBy(x => x.subject)
        .toSeq
        .map { case (subject, thread) =>
          val timestamps = thread.map(x => x.emailDate.getTime)
          (subject, thread.length, (timestamps.max - timestamps.min) / 1000)
        }
        .collect { case (subject, mailCount, seconds) if seconds != 0 =>
          subject -> (10 + Math.log10(mailCount.toDouble / seconds))
        }
        .toMap

      val stopWords = getStopWords.toSet

      //Term -> log scaled number of occurrences in the bodies of the training mails
      val tdm: Map[String, Double] = trainingData
        .flatMap(x => x.body.split(" "))
        .filter(x => x.nonEmpty && !stopWords.contains(x))
        .groupBy(x => x)
        .map(x => (x._1, Math.log10(x._2.length + 1)))

      //Returns (rank, termWeight, threadGroupWeight, commonTermsWeight, senderWeight) for a mail.
      //Every weight falls back to 1.0 when nothing is known, so it does not influence the product.
      def rankEmail(mail: EmailData): (Double, Double, Double, Double, Double) = {
        //Determine the weight of the sender
        val senderWeight = senderWeights.get(mail.sender).map(x => x + 1).getOrElse(1.0)

        //Determine the weight of the subject
        //NOTE(review): the subject terms are weighted with the body term table (tdm) - confirm
        //that this is intended rather than a table built from thread subjects.
        val termWeight = averageTermWeight(significantTerms(mail.subject, stopWords), tdm)

        //Determine if the email is from a thread, and if it is the weight from this thread
        val threadGroupWeight = threadWeights.getOrElse(mail.subject, 1.0)

        //Determine the commonly used terms in the email and the weight belonging to it
        val commonTermsWeight = averageTermWeight(significantTerms(mail.body, stopWords), tdm)

        val rank = termWeight * threadGroupWeight * commonTermsWeight * senderWeight
        (rank, termWeight, threadGroupWeight, commonTermsWeight, senderWeight)
      }

      val sortedTrainingRanks = trainingData
        .map(mail => (mail, rankEmail(mail)._1))
        .sortBy(x => x._2)

      val median = sortedTrainingRanks(sortedTrainingRanks.length / 2)._2
      val mean = sortedTrainingRanks.map(x => x._2).sum / sortedTrainingRanks.length
      println("Median:" + median + " Mean:" + mean)

      val testingRanks = testingData.map { mail =>
        val (rank, termWeight, threadGroupWeight, commonTermsWeight, senderWeight) = rankEmail(mail)
        (mail, rank, termWeight, threadGroupWeight, commonTermsWeight, senderWeight)
      }

      //A test mail is a priority mail when it ranks at least half the mean training rank
      val priorityEmails = testingRanks.filter(x => x._2 >= mean / 2).sortBy(x => -x._2)
      val df = new java.text.DecimalFormat("#.##")

      println("|Date | Sender | Subject | Rank | thread term | thread time | common terms | sender |")
      println("| :--- | :-- | :-- | :-- | :-- | :-- | :-- | :-- | ")
      priorityEmails.take(10).foreach(x => println("| " + x._1.emailDate + " | " + x._1.sender + " | " + x._1.subject + " | " + df.format(x._2) + " |" + df.format(x._3) + " |" + df.format(x._4) + " |" + df.format(x._5) + " |" + df.format(x._6) + " |"))

      println(priorityEmails.length + " ranked as priority")

    }
    catch {
      case e: Exception => println("You probably are missing the sample data. You can download these from the spamassasin corpus (mentioned in the example on http://xyclade.ml) and place them in the directory 'data' in this project. Check the exception for more details: " + e)
    }
  }

  /** Splits a text into lower case terms of letters only, without empty terms and stop words. */
  private def significantTerms(text: String, stopWords: Set[String]): Array[String] =
    text
      .replaceAll("[^a-zA-Z ]", "")
      .toLowerCase.split(" ")
      .filter(x => x.nonEmpty && !stopWords.contains(x))

  /** Average of (term weight + 1) over the terms, where unknown terms count as 1.0; 1.0 when there are no terms. */
  private def averageTermWeight(terms: Array[String], termWeights: Map[String, Double]): Double =
    if (terms.nonEmpty) terms.map(x => termWeights.get(x).map(z => z + 1).getOrElse(1.0)).sum / terms.length
    else 1.0

  /**
   * Lists the regular files directly inside `path`.
   *
   * @return the files, or an empty list when `path` is not an existing, readable directory
   */
  def getFilesFromDir(path: String): List[File] = {
    val d = new File(path)
    if (d.exists && d.isDirectory) {
      //Remove the mac os basic storage file, and alternatively for unix systems "cmds".
      //listFiles returns null on an I/O error, which is treated as an empty directory.
      Option(d.listFiles)
        .map(files => files.filter(x => x.isFile && !x.toString.contains(".DS_Store") && !x.toString.contains("cmds")).toList)
        .getOrElse(List[File]())
    } else {
      List[File]()
    }
  }

  /** Reads the complete file as one string with "\n" between the lines. */
  def getFullEmail(file: File): String = {
    //Note that the encoding of the example files is latin1, thus this should be passed to the from file method.
    val source = scala.io.Source.fromFile(file)("latin1")
    try {
      source.getLines mkString "\n"
    } finally {
      source.close()
    }
  }

  /**
   * Extracts the subject of an email: the text after the first "Subject:" up to the end of
   * that line, trimmed and in lower case, without leading "re:" reply tags.
   *
   * @return the normalised subject, or "" when the email has no "Subject:" line
   */
  def getSubjectFromEmail(email: String): String = {
    val subjectIndex = email.indexOf("Subject:")
    if (subjectIndex == -1) {
      ""
    } else {
      //The subject runs until the end of the line, or the end of the email if it is the last line
      val lineBreak = email.indexOf('\n', subjectIndex)
      val endOfSubjectIndex = if (lineBreak == -1) email.length else lineBreak

      //Start of subject + 8 (length of "Subject:") until the end of the line
      val subject = email
        .substring(subjectIndex + 8, endOfSubjectIndex)
        .trim
        .toLowerCase

      //Remove reply tags to make grouping on topic easier. Only tags at the start are removed,
      //so subjects that merely contain "re: " (e.g. "hardware: ...") stay intact.
      subject.replaceFirst("^(re:\\s*)+", "")
    }
  }

  /**
   * Extracts the message body: everything from the first blank line on, with line breaks
   * turned into spaces, all characters other than a-z, A-Z and space removed, in lower case.
   *
   * @return the normalised body, or "" when the email contains no blank line
   */
  def getMessageBodyFromEmail(email: String): String = {
    val firstLineBreak = email.indexOf("\n\n")
    if (firstLineBreak == -1) {
      ""
    } else {
      email.substring(firstLineBreak)
        .replace("\n", " ")
        .replaceAll("[^a-zA-Z ]", "")
        .toLowerCase
    }
  }

  /**
   * Extracts the sender address from the first "From:" line, trimmed and in lower case.
   *
   * @return the address, or "" when the email has no "From:" line
   */
  def getSenderFromEmail(email: String): String = {
    val fromLineIndex = email.indexOf("From:")
    if (fromLineIndex == -1) {
      ""
    } else {
      val lineBreak = email.indexOf('\n', fromLineIndex)
      val endOfLine = if (lineBreak == -1) email.length else lineBreak

      //The text after "From:" (5 characters) on that line
      val fromValue = email.substring(fromLineIndex + 5, endOfLine)

      //Search for the <> tags in this line, as if they are there, the email address is contained inside these tags
      val tagStart = fromValue.indexOf('<')
      val tagEnd = fromValue.indexOf('>')

      if (tagStart != -1 && tagEnd > tagStart) {
        fromValue
          .substring(tagStart + 1, tagEnd)
          .trim
          .toLowerCase
      } else {
        //The email address was not embedded in <> tags.
        //Remove a possible name embedded in () at the end of the line, for example in
        //test@test.com (tester) the name would be removed here
        val emailString = fromValue.trim.toLowerCase
        val additionalNameStartIndex = emailString.indexOf('(')
        if (additionalNameStartIndex == -1) emailString
        else emailString.substring(0, additionalNameStartIndex).trim
      }
    }
  }

  /**
   * Parses the date on the first "Date:" line of the email.
   *
   * Each known pattern is tried in order against the first characters of the date text (as
   * many characters as the pattern is long). Day and month names are parsed as English
   * regardless of the default locale of the JVM.
   *
   * @return the parsed date, or null when there is no "Date:" line or no pattern matches
   */
  def getDateFromEmail(email: String): Date = {
    val dateLineIndex = email.indexOf("Date:")
    if (dateLineIndex == -1) {
      null
    } else {
      val lineBreak = email.indexOf('\n', dateLineIndex)
      val endOfDateLine = if (lineBreak == -1) email.length else lineBreak
      val dateText = email.substring(dateLineIndex + 5, endOfDateLine).trim

      //All possible date patterns in the emails.
      val datePatterns = Array("EEE MMM dd HH:mm:ss yyyy", "EEE, dd MMM yyyy HH:mm", "dd MMM yyyy HH:mm:ss", "EEE MMM dd yyyy HH:mm")

      //The iterator is lazy, so parsing stops at the first pattern that works. A date text
      //shorter than a pattern makes substring throw, which Try turns into "did not match".
      datePatterns.iterator
        .map(x => Try(new SimpleDateFormat(x, java.util.Locale.ENGLISH).parse(dateText.substring(0, x.length))).toOption)
        .collectFirst { case Some(date) => date }
        .orNull
    }
  }

  /** Reads the stop words from data/stopwords.txt, one per line, without surrounding whitespace or blank lines. */
  def getStopWords: List[String] = {
    val source = scala.io.Source.fromFile(new File("data/stopwords.txt"))("latin1")
    try {
      source.getLines().map(x => x.trim).filter(x => x.nonEmpty).toList
    } finally {
      source.close()
    }
  }
}
-------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /Code Examples/SVM/.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /Code Examples/SVM/.idea/libraries/Maven__com_github_haifengl_smile_core_1_0_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/SVM/.idea/libraries/Maven__com_github_haifengl_smile_data_1_0_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/SVM/.idea/libraries/Maven__com_github_haifengl_smile_graph_1_0_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/SVM/.idea/libraries/Maven__com_github_haifengl_smile_math_1_0_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/SVM/.idea/libraries/Maven__com_github_haifengl_smile_plot_1_0_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/SVM/.idea/libraries/Maven__org_swinglabs_swingx_swingx_all_1_6_4.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/SVM/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /Code Examples/SVM/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /Code Examples/SVM/.idea/scala_compiler.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /Code Examples/SVM/.idea/scopes/scope_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | -------------------------------------------------------------------------------- /Code Examples/SVM/.idea/uiDesigner.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 
import java.awt.{Color, Dimension}
import java.io.{ File}
import java.text.DecimalFormat

import smile.math.kernel.{PolynomialKernel, GaussianKernel}
import smile.plot.ScatterPlot
import smile.classification.SVM
import scala.collection.mutable
import scala.swing.{SimpleSwingApplication, MainFrame}

// File: Code Examples/SVM/src/main/java/SVM_Example_2.scala

/**
 * SVM example 2: trains an SVM with a polynomial kernel for every combination of kernel
 * degree and margin penalty on the training file, measures the error on the separate test
 * file, and prints the error percentages as a Markdown table (one column per degree, one
 * row per margin penalty). The training data is shown in a scatter plot.
 */
object SVM_Example_2 extends SimpleSwingApplication {

  def top = new MainFrame {
    title = "SVM Example 2"
    //File path (this changes per example)
    val trainingPath = "data/SVM_Example_2.csv"
    val testingPath = "data/SVM_Example_2_Test_data.csv"
    val df = new DecimalFormat("#.#")
    //Loading of the data and plot generation stays the same
    val trainingData = GetDataFromCSV(new File(trainingPath))
    val testingData = GetDataFromCSV(new File(testingPath))

    val plot = ScatterPlot.plot(trainingData._1, trainingData._2, '@', Array(Color.blue, Color.green))
    peer.setContentPane(plot)

    //The degrees of the polynomial kernel (kept under the name used for the table columns)
    val sigmas = Array(2, 3, 4, 5)
    val marginPenalties = Array(0.001, 0.01, 0.1, 0.2, 0.5, 1.0, 2.0, 3.0, 10.0, 100.0)

    //(degree, margin penalty) -> percentage of wrongly predicted test points
    val errorRates: Map[(Int, Double), Double] = (for {
      degree <- sigmas
      marginPenalty <- marginPenalties
    } yield {
      val svm = new SVM[Array[Double]](new PolynomialKernel(degree), marginPenalty, 2)
      svm.learn(trainingData._1, trainingData._2)
      svm.finish()

      //Calculate how well the SVM predicts on the test set
      val labelledPoints = testingData._1.zip(testingData._2)
      val falsePredictions = labelledPoints.count { case (point, label) => svm.predict(point) != label }
      val error = falsePredictions.toDouble / labelledPoints.length * 100

      println("degree: " + degree + " margin: " + marginPenalty + " error: " + error)
      (degree, marginPenalty) -> error
    }).toMap

    //Markdown table. The delimiter row is generated so it always has as many columns as the
    //header, and every row ends its own line so no blank line interrupts the table.
    println("| |" + sigmas.map(x => " s: " + x + " |").mkString)
    println("| :-- |" + sigmas.map(_ => " :--: |").mkString)
    marginPenalties.foreach { x =>
      println("| **c: " + x + "** |" + sigmas.map(y => " " + df.format(errorRates((y, x))) + "% |").mkString)
    }

    size = new Dimension(400, 400)
  }

  /**
   * Reads a CSV file with a header line followed by lines of the form "x,y,class".
   * Blank lines are ignored.
   *
   * @return the data points and, at the same indices, their class labels
   */
  def GetDataFromCSV(file: File): (Array[Array[Double]], Array[Int]) = {
    val source = scala.io.Source.fromFile(file)
    try {
      val data = source.getLines().drop(1).filter(x => x.trim.nonEmpty).map(x => GetDataFromString(x)).toArray
      (data.map(x => x._1), data.map(x => x._2))
    } finally {
      //Also release the file when a line cannot be parsed
      source.close()
    }
  }

  /**
   * Parses one "x,y,class" line; whitespace around the values is ignored.
   *
   * @return the (x, y) coordinates and the class label
   */
  def GetDataFromString(dataString: String): (Array[Double], Int) = {
    //Split the comma separated value string into an array of strings
    val dataArray: Array[String] = dataString.split(',').map(x => x.trim)

    //Extract the values from the strings
    val coordinates = Array(dataArray(0).toDouble, dataArray(1).toDouble)
    val classifier: Int = dataArray(2).toInt

    //And return the result in a format that can later easily be used to feed to Smile
    (coordinates, classifier)
  }
}

// File: Code Examples/SVM/src/main/java/SupportVectorMachine.scala

/**
 * SVM example 1: trains an SVM with a Gaussian kernel for every combination of sigma and
 * margin penalty, measures the error on the same data it was trained on, and prints the
 * error percentages as a Markdown table (one column per sigma, one row per margin penalty).
 * The data is shown in a scatter plot.
 */
object SupportVectorMachine extends SimpleSwingApplication {

  def top = new MainFrame {
    title = "SVM Example 1"
    //File path (this changes per example)
    val path = "data/SVM_Example_1.csv"
    val df = new DecimalFormat("#.#")
    //Loading of the test data and plot generation stays the same
    val testData = GetDataFromCSV(new File(path))
    val plot = ScatterPlot.plot(testData._1, testData._2, '@', Array(Color.blue, Color.green))
    peer.setContentPane(plot)

    //Here we do our SVM fine tuning with possibly different kernels
    val sigmas = Array(0.001, 0.01, 0.1, 0.2, 0.5, 1.0, 2.0, 3.0, 10.0, 100.0)
    val marginPenalties = Array(0.001, 0.01, 0.1, 0.2, 0.5, 1.0, 2.0, 3.0, 10.0, 100.0)

    //(sigma, margin penalty) -> percentage of wrongly predicted points
    val errorRates: Map[(Double, Double), Double] = (for {
      sigma <- sigmas
      marginPenalty <- marginPenalties
    } yield {
      val svm = new SVM[Array[Double]](new GaussianKernel(sigma), marginPenalty, 2)
      svm.learn(testData._1, testData._2)
      svm.finish()

      //Calculate how well the SVM predicts on the training set
      val labelledPoints = testData._1.zip(testData._2)
      val falsePredictions = labelledPoints.count { case (point, label) => svm.predict(point) != label }
      val error = falsePredictions.toDouble / labelledPoints.length * 100

      println("sigma: " + sigma + " margin: " + marginPenalty + " error: " + error)
      (sigma, marginPenalty) -> error
    }).toMap

    //Markdown table. The delimiter row is generated so it always has as many columns as the
    //header, and every row ends its own line so no blank line interrupts the table.
    println("| |" + sigmas.map(x => " s: " + x + " |").mkString)
    println("| :-- |" + sigmas.map(_ => " :--: |").mkString)
    marginPenalties.foreach { x =>
      println("| **c: " + x + "** |" + sigmas.map(y => " " + df.format(errorRates((y, x))) + "% |").mkString)
    }

    size = new Dimension(400, 400)
  }

  /**
   * Reads a CSV file with a header line followed by lines of the form "x,y,class".
   * Blank lines are ignored.
   *
   * @return the data points and, at the same indices, their class labels
   */
  def GetDataFromCSV(file: File): (Array[Array[Double]], Array[Int]) = {
    val source = scala.io.Source.fromFile(file)
    try {
      val data = source.getLines().drop(1).filter(x => x.trim.nonEmpty).map(x => GetDataFromString(x)).toArray
      (data.map(x => x._1), data.map(x => x._2))
    } finally {
      //Also release the file when a line cannot be parsed
      source.close()
    }
  }

  /**
   * Parses one "x,y,class" line; whitespace around the values is ignored.
   *
   * @return the (x, y) coordinates and the class label
   */
  def GetDataFromString(dataString: String): (Array[Double], Int) = {
    //Split the comma separated value string into an array of strings
    val dataArray: Array[String] = dataString.split(',').map(x => x.trim)

    //Extract the values from the strings
    val coordinates = Array(dataArray(0).toDouble, dataArray(1).toDouble)
    val classifier: Int = dataArray(2).toInt

    //And return the result in a format that can later easily be used to feed to Smile
    (coordinates, classifier)
  }
}
Examples/TextRegression/.idea/libraries/Maven__com_github_haifengl_smile_graph_1_0_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/TextRegression/.idea/libraries/Maven__com_github_haifengl_smile_math_1_0_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/TextRegression/.idea/libraries/Maven__com_github_haifengl_smile_plot_1_0_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/TextRegression/.idea/libraries/Maven__com_github_tototoshi_scala_csv_2_11_1_2_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/TextRegression/.idea/libraries/Maven__org_scala_lang_scala_library_2_11_6.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/TextRegression/.idea/libraries/Maven__org_swinglabs_swingx_swingx_all_1_6_4.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /Code Examples/TextRegression/.idea/libraries/com_github_tototoshi_scala_csv_2_11_1_2_0.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /Code Examples/TextRegression/.idea/libraries/smile_1_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /Code Examples/TextRegression/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /Code Examples/TextRegression/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /Code Examples/TextRegression/.idea/scala_compiler.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /Code Examples/TextRegression/.idea/scopes/scope_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | -------------------------------------------------------------------------------- /Code Examples/TextRegression/.idea/uiDesigner.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 
81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | -------------------------------------------------------------------------------- /Code Examples/TextRegression/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /Code Examples/TextRegression/TextRegression.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /Code Examples/TextRegression/data/TextRegression_Example_1.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xyclade/MachineLearning/a01a1c3acc8fa6f534ab4765f201e6dab78aae04/Code Examples/TextRegression/data/TextRegression_Example_1.csv -------------------------------------------------------------------------------- /Code Examples/TextRegression/data/stopwords.txt: -------------------------------------------------------------------------------- 1 | i 2 | me 3 | my 4 | myself 5 | we 6 | our 7 | ours 8 | ourselves 9 | you 10 | your 11 | yours 12 | yourself 13 | yourselves 14 | he 15 | him 16 | his 17 | himself 18 | she 19 | her 20 | hers 21 | herself 22 | it 23 | its 24 | itself 25 | they 26 | them 27 | their 28 | theirs 29 | themselves 30 | what 31 | which 32 | who 33 | whom 34 | this 35 | that 36 | these 37 | those 38 | am 39 | is 40 | are 41 | was 42 | were 43 | be 44 | been 45 | being 46 | have 47 | has 48 | had 49 | having 50 | do 51 | does 52 | did 53 | doing 54 | would 55 | should 56 | could 57 | 
ought 58 | im 59 | youre 60 | hes 61 | shes 62 | were 63 | theyre 64 | ive 65 | youve 66 | weve 67 | theyve 68 | id 69 | youd 70 | hed 71 | shed 72 | wed 73 | theyd 74 | ill 75 | youll 76 | hell 77 | shell 78 | well 79 | theyll 80 | isnt 81 | arent 82 | wasnt 83 | werent 84 | hasnt 85 | havent 86 | hadnt 87 | doesnt 88 | dont 89 | didnt 90 | wont 91 | wouldnt 92 | shant 93 | shouldnt 94 | cant 95 | cannot 96 | couldnt 97 | mustnt 98 | lets 99 | thats 100 | whos 101 | whats 102 | heres 103 | theres 104 | whens 105 | wheres 106 | whys 107 | hows 108 | a 109 | an 110 | the 111 | and 112 | but 113 | if 114 | or 115 | because 116 | as 117 | until 118 | while 119 | of 120 | at 121 | by 122 | for 123 | with 124 | about 125 | against 126 | between 127 | into 128 | through 129 | during 130 | before 131 | after 132 | above 133 | below 134 | to 135 | from 136 | up 137 | down 138 | in 139 | out 140 | on 141 | off 142 | over 143 | under 144 | again 145 | further 146 | then 147 | once 148 | here 149 | there 150 | when 151 | where 152 | why 153 | how 154 | all 155 | any 156 | both 157 | each 158 | few 159 | more 160 | most 161 | other 162 | some 163 | such 164 | no 165 | nor 166 | not 167 | only 168 | own 169 | same 170 | so 171 | than 172 | too 173 | very 174 | tr 175 | td 176 | 177 | 178 | -------------------------------------------------------------------------------- /Code Examples/TextRegression/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | xyclade.ml 8 | TextRegression 9 | 1.0-SNAPSHOT 10 | 11 | 12 | 13 | com.github.haifengl 14 | smile-core 15 | 1.0.2 16 | 17 | 18 | com.github.haifengl 19 | smile-plot 20 | 1.0.2 21 | 22 | 23 | com.github.tototoshi 24 | scala-csv_2.11 25 | 1.2.1 26 | 27 | 28 | -------------------------------------------------------------------------------- /Code Examples/TextRegression/projectFilesBackup/TextRegression.iml: 
/*
 * Repository-dump residue that shared these lines with the Scala sources below.
 * It is not part of the Scala code and is kept verbatim so no content is lost:
 *
 * -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /Code Examples/TextRegression/src/main/java/DTM.scala: --------------------------------------------------------------------------------
 */
import java.io.File
import scala.collection.mutable

/**
 * A document-term matrix under construction.
 *
 * `records` holds one [[DTMRecord]] per added document (most recently added first) and
 * `wordList` holds every distinct word seen so far (most recently discovered first).
 * `wordList` defines the column order used by `getNumericRepresentationForRecords`.
 */
class DTM {

  var records: List[DTMRecord] = List[DTMRecord]()
  var wordList: List[String] = List[String]()

  /**
   * Tokenises a document and stores its per-word counts as a new record.
   *
   * The content is lower-cased and split on single spaces; empty tokens (produced by
   * consecutive spaces) are discarded so they never become a feature.
   *
   * @param documentName    unique name of the document
   * @param rank            the rank associated with the document
   * @param documentContent the raw text of the document
   * @throws Exception if a document with the same name was already added
   */
  def addDocumentToRecords(documentName: String, rank: Int, documentContent: String): Unit = {
    if (records.exists(_.document == documentName)) {
      throw new Exception("Document already exists in the records")
    }

    // Snapshot of the vocabulary known so far. A word must be added to wordList only the
    // first time it is seen in ANY document; checking against the per-document counts (as
    // was done before) appended the word once per document and duplicated matrix columns.
    val knownWords = mutable.HashSet[String]() ++= wordList
    val wordRecords = mutable.HashMap[String, Int]()

    documentContent.toLowerCase.split(" ").filter(_.nonEmpty).foreach { word =>
      wordRecords(word) = wordRecords.getOrElse(word, 0) + 1
      // HashSet.add returns true only when the word was not present yet.
      if (knownWords.add(word)) {
        wordList = word :: wordList
      }
    }

    records = new DTMRecord(documentName, rank, wordRecords) :: records
  }

  /**
   * Reads the stop words from `data/stopwords.txt` (relative to the working directory),
   * decoded as latin1.
   *
   * @return one entry per non-blank line, with surrounding whitespace (including a
   *         trailing carriage return from CRLF files) removed
   */
  def getStopWords(): List[String] = {
    val source = scala.io.Source.fromFile(new File("data/stopwords.txt"))("latin1")
    try {
      // toList forces the lazy iterator before the source is closed.
      source.getLines().map(_.trim).filter(_.nonEmpty).toList
    } finally {
      source.close()
    }
  }

  /**
   * Converts the stored records into numeric arrays.
   *
   * Side effect: stop words are removed from `wordList` (and any duplicates dropped), so
   * after this call `wordList` lists exactly the columns of the returned matrix, in order.
   *
   * @return a pair of (matrix, ranks): `matrix(i)(j)` is the number of occurrences of
   *         `wordList(j)` in `records(i)`, and `ranks(i)` is the rank of `records(i)`
   */
  def getNumericRepresentationForRecords(): (Array[Array[Double]], Array[Double]) = {
    // A Set gives constant-time stop-word lookups instead of scanning a List per word.
    val stopWords = getStopWords().toSet
    wordList = wordList.distinct.filterNot(stopWords.contains)

    val vocabulary = wordList.toArray
    val ranks = records.map(_.rank.toDouble).toArray
    val dtmNumeric = records.map { record =>
      vocabulary.map(term => record.occurrences.getOrElse(term, 0).toDouble)
    }.toArray

    (dtmNumeric, ranks)
  }
}

/**
 * The word counts of a single document.
 *
 * @param document    name of the document
 * @param rank        rank associated with the document
 * @param occurrences number of occurrences per word in the document
 */
class DTMRecord(val document : String,
                val rank : Int,
                var occurrences : mutable.HashMap[String,Int]
               )

// -------------------------------------------------------------------------------- /Code Examples/TextRegression/src/main/java/TextRegression.scala: --------------------------------------------------------------------------------
import java.awt.Color
import java.io.File
import java.util.Calendar
import com.github.tototoshi.csv._
import smile.plot._
import smile.regression.{LASSO, RidgeRegression}
import smile.validation.CrossValidation

/**
 * Example: fits LASSO models on a document-term matrix and prints the RMSE for several
 * lambda values, using 2-fold cross validation.
 */
object TextRegression {

  def main(args: Array[String]): Unit = {

    //Get the example data
    val basePath = "data/TextRegression_Example_1.csv"
    val testData = GetDataFromCSV(new File(basePath))

    //Create a document term matrix for the data
    val documentTermMatrix = new DTM()
    testData.foreach(x => documentTermMatrix.addDocumentToRecords(x._1, x._2, x._3))

    //Get the cross validation data
    val cv = new CrossValidation(testData.length, 2)
    val (dataPoints, ranks) = documentTermMatrix.getNumericRepresentationForRecords()

    //These are the lambda values we will verify against
    val lambdas: Array[Double] = Array(0.1, 0.25, 0.5, 1.0, 2.0, 5.0)

    for (i <- 0 until cv.k) {
      //Train on the training indices of this fold. (Previously the test indices were
      //used for training and their complement for testing, i.e. the sets were swapped.)
      val dpForTraining = cv.train(i).map(idx => dataPoints(idx))
      val classifiersForTraining = cv.train(i).map(idx => ranks(idx))

      //And validate on the held-out test indices of the same fold
      val dpForTesting = cv.test(i).map(idx => dataPoints(idx))
      val classifiersForTesting = cv.test(i).map(idx => ranks(idx))

      lambdas.foreach { lambda =>
        //Define a new model based on the training data and one of the lambda's
        val model = new LASSO(dpForTraining, classifiersForTraining, lambda)

        //Compute the RMSE for this model with this lambda
        val results = dpForTesting.map(y => model.predict(y)) zip classifiersForTesting
        val RMSE = Math.sqrt(results.map(r => Math.pow(r._1 - r._2, 2)).sum / results.length)
        println(Calendar.getInstance().getTime + "Lambda: " + lambda + " RMSE: " + RMSE)
      }
    }
  }

  /**
   * Reads the documents from a CSV file, skipping the first (header) row.
   *
   * @param file the CSV file to read
   * @return one tuple per data row: (column 1, column 3 parsed as Int, column 4), which
   *         `main` passes on as (document name, rank, document content)
   */
  def GetDataFromCSV(file: File) : List[(String,Int,String)]= {
    val reader = CSVReader.open(file)
    try {
      reader.all().drop(1).map(x => (x(1), x(3).toInt, x(4)))
    } finally {
      //Release the file handle even when parsing a row fails
      reader.close()
    }
  }

}

/*
 * Repository-dump residue that followed the Scala sources on the same line. It is the
 * beginning of KNN_Example_1.csv, which continues on the next line of the dump, and is
 * kept verbatim so no content is lost:
 *
 * -------------------------------------------------------------------------------- /Example Data/KNN_Example_1.csv: -------------------------------------------------------------------------------- 1 | "X","Y","Label" 2 | 2.37354618925767,5.39810588036707,0 3 | 3.18364332422208,4.38797360674923,0 4 | 2.16437138758995,5.34111969142442,0 5 | 4.59528080213779,3.87063690391921,0 6 | 3.32950777181536,6.43302370170104,0 7 | 2.17953161588198,6.98039989850586,0 8 | 3.48742905242849,4.63277852353349,0 9 | 3.73832470512922,3.95586537368347,0 10 | 3.57578135165349,5.56971962744241,0 11 | 2.69461161284364,4.86494539611918,0 12 | 4.51178116845085,7.40161776050478,0 13 |
 */
3.38984323641143,4.96075999726683,0 14 | 2.3787594194582,5.68973936245078,0 15 | 0.7853001128225,5.02800215878067,0 16 | 4.12493091814311,4.25672679111759,0 17 | 2.95506639098477,5.18879229951434,0 18 | 2.98380973690105,3.19504137110896,0 19 | 3.9438362106853,6.46555486156289,0 20 | 3.82122119509809,5.1532533382119,0 21 | 3.59390132121751,7.17261167036215,0 22 | 3.91897737160822,5.47550952889966,0 23 | 3.78213630073107,4.29005356907819,0 24 | 3.07456498336519,5.61072635348905,0 25 | 1.01064830413663,4.06590236835575,0 26 | 3.61982574789471,3.7463665997609,0 27 | 2.943871260471,5.29144623551746,0 28 | 2.84420449329467,4.55670812678157,0 29 | 1.52924761610073,5.00110535163162,0 30 | 2.52184994489138,5.07434132415166,0 31 | 3.4179415601997,4.41047905381193,0 32 | 4.35867955152904,4.4313312671815,0 33 | 2.897212272657,4.86482138487617,0 34 | 3.38767161155937,6.1780869965732,0 35 | 2.94619495941709,3.47643319957024,0 36 | 1.62294044317139,5.59394618762842,0 37 | 2.58500543670032,5.33295037121352,0 38 | 2.60571004628965,6.06309983727636,0 39 | 2.94068660328881,4.6958160763657,0 40 | 4.10002537198388,5.37001880991629,0 41 | 3.76317574845754,5.26709879077223,0 42 | 2.83547640374641,4.45747996900835,0 43 | 2.74663831986349,6.20786780598317,0 44 | 3.69696337540474,6.16040261569495,0 45 | 3.55666319867366,5.700213649515,0 46 | 2.31124430545048,6.58683345454085,0 47 | 2.29250484303788,5.5584864255653,0 48 | 3.36458196213683,3.72340779154196,0 49 | 3.76853292451542,4.42673458576311,0 50 | 2.88765378784977,3.77538738510164,0 51 | 3.88110772645421,4.52659936356069,0 52 | 4.37963332277588,7.45018710127266,1 53 | 5.04211587314424,6.98144016728536,1 54 | 4.08907835144755,6.68193162545616,1 55 | 5.15802877240407,6.0706378525463,1 56 | 4.34541535608118,5.51253968985852,1 57 | 6.76728726937265,5.92480770338432,1 58 | 5.71670747601721,8.00002880371391,1 59 | 5.91017422949523,6.37873330520318,1 60 | 5.38418535782634,5.61557315261551,1 61 | 6.68217608051942,8.86929062242358,1 62 | 
4.36426354605102,7.42510037737245,1 63 | 4.53835526963943,6.76135289908697,1 64 | 6.43228223854166,8.05848304870902,1 65 | 4.34930364668963,7.88642265137494,1 66 | 4.79261925639803,6.38075695176885,1 67 | 4.60719207055802,9.20610246454047,1 68 | 4.68000713145149,6.74497296985898,1 69 | 4.72088669702344,5.57550534978719,1 70 | 5.49418833126783,6.85560039804578,1 71 | 4.82266951773039,7.20753833923234,1 72 | 4.49404253788574,9.30797839905936,1 73 | 6.34303882517041,7.10580236789371,1 74 | 4.78542059145313,7.45699880542341,1 75 | 4.82044346995661,6.92284706464347,1 76 | 4.89980925878644,6.66599915763346,1 77 | 5.71266630705141,6.96527397168872,1 78 | 4.92643559587367,7.78763960563016,1 79 | 4.96236582853295,9.07524500865228,1 80 | 4.31833952124434,8.02739243876377,1 81 | 4.67572972775368,8.2079083983867,1 82 | 5.06016044043452,5.76867657844196,1 83 | 4.41110551374034,7.98389557005338,1 84 | 5.53149619263257,7.21992480366065,1 85 | 3.48160591821321,5.53274997090776,1 86 | 5.30655786078977,7.52102274264814,1 87 | 3.46355017646241,6.84124539528398,1 88 | 4.69902387316339,8.4645873119698,1 89 | 4.47172009555499,6.23391800039534,1 90 | 4.347905219319,6.56978824607145,1 91 | 4.94310322215261,6.07389050262256,1 92 | 3.08564057431999,6.82289603856346,1 93 | 6.17658331201856,7.40201177948634,1 94 | 3.335027563788,6.26825182688039,1 95 | 4.53646959852761,7.83037316798167,1 96 | 3.88407989495715,5.79191721369553,1 97 | 4.24918099880655,5.95201558719226,1 98 | 7.08716654562835,8.44115770684428,1 99 | 5.01739561969325,5.98415253469535,1 100 | 3.71369946956567,7.41197471231752,1 101 | 3.35939446558142,6.61892394889108,1 102 | -------------------------------------------------------------------------------- /Example Data/Recommendation_Example_1.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xyclade/MachineLearning/a01a1c3acc8fa6f534ab4765f201e6dab78aae04/Example Data/Recommendation_Example_1.zip 
-------------------------------------------------------------------------------- /Example Data/TextRegression_Example_1.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xyclade/MachineLearning/a01a1c3acc8fa6f534ab4765f201e6dab78aae04/Example Data/TextRegression_Example_1.csv -------------------------------------------------------------------------------- /Example Data/stopwords.txt: -------------------------------------------------------------------------------- 1 | i 2 | me 3 | my 4 | myself 5 | we 6 | our 7 | ours 8 | ourselves 9 | you 10 | your 11 | yours 12 | yourself 13 | yourselves 14 | he 15 | him 16 | his 17 | himself 18 | she 19 | her 20 | hers 21 | herself 22 | it 23 | its 24 | itself 25 | they 26 | them 27 | their 28 | theirs 29 | themselves 30 | what 31 | which 32 | who 33 | whom 34 | this 35 | that 36 | these 37 | those 38 | am 39 | is 40 | are 41 | was 42 | were 43 | be 44 | been 45 | being 46 | have 47 | has 48 | had 49 | having 50 | do 51 | does 52 | did 53 | doing 54 | would 55 | should 56 | could 57 | ought 58 | im 59 | youre 60 | hes 61 | shes 62 | were 63 | theyre 64 | ive 65 | youve 66 | weve 67 | theyve 68 | id 69 | youd 70 | hed 71 | shed 72 | wed 73 | theyd 74 | ill 75 | youll 76 | hell 77 | shell 78 | well 79 | theyll 80 | isnt 81 | arent 82 | wasnt 83 | werent 84 | hasnt 85 | havent 86 | hadnt 87 | doesnt 88 | dont 89 | didnt 90 | wont 91 | wouldnt 92 | shant 93 | shouldnt 94 | cant 95 | cannot 96 | couldnt 97 | mustnt 98 | lets 99 | thats 100 | whos 101 | whats 102 | heres 103 | theres 104 | whens 105 | wheres 106 | whys 107 | hows 108 | a 109 | an 110 | the 111 | and 112 | but 113 | if 114 | or 115 | because 116 | as 117 | until 118 | while 119 | of 120 | at 121 | by 122 | for 123 | with 124 | about 125 | against 126 | between 127 | into 128 | through 129 | during 130 | before 131 | after 132 | above 133 | below 134 | to 135 | from 136 | up 137 | down 138 | in 139 | 
out 140 | on 141 | off 142 | over 143 | under 144 | again 145 | further 146 | then 147 | once 148 | here 149 | there 150 | when 151 | where 152 | why 153 | how 154 | all 155 | any 156 | both 157 | each 158 | few 159 | more 160 | most 161 | other 162 | some 163 | such 164 | no 165 | nor 166 | not 167 | only 168 | own 169 | same 170 | so 171 | than 172 | too 173 | very 174 | tr 175 | td 176 | 177 | 178 | -------------------------------------------------------------------------------- /Images/DynamicMachineLearning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xyclade/MachineLearning/a01a1c3acc8fa6f534ab4765f201e6dab78aae04/Images/DynamicMachineLearning.png -------------------------------------------------------------------------------- /Images/Formula1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xyclade/MachineLearning/a01a1c3acc8fa6f534ab4765f201e6dab78aae04/Images/Formula1.png -------------------------------------------------------------------------------- /Images/Formula2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xyclade/MachineLearning/a01a1c3acc8fa6f534ab4765f201e6dab78aae04/Images/Formula2.png -------------------------------------------------------------------------------- /Images/Formula3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xyclade/MachineLearning/a01a1c3acc8fa6f534ab4765f201e6dab78aae04/Images/Formula3.png -------------------------------------------------------------------------------- /Images/Formula4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xyclade/MachineLearning/a01a1c3acc8fa6f534ab4765f201e6dab78aae04/Images/Formula4.png 
-------------------------------------------------------------------------------- /Images/Good_Fit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xyclade/MachineLearning/a01a1c3acc8fa6f534ab4765f201e6dab78aae04/Images/Good_Fit.png -------------------------------------------------------------------------------- /Images/Ham_No_Stopwords.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xyclade/MachineLearning/a01a1c3acc8fa6f534ab4765f201e6dab78aae04/Images/Ham_No_Stopwords.png -------------------------------------------------------------------------------- /Images/Ham_Stopwords.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xyclade/MachineLearning/a01a1c3acc8fa6f534ab4765f201e6dab78aae04/Images/Ham_Stopwords.png -------------------------------------------------------------------------------- /Images/HumanDataPoints.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xyclade/MachineLearning/a01a1c3acc8fa6f534ab4765f201e6dab78aae04/Images/HumanDataPoints.png -------------------------------------------------------------------------------- /Images/KNNPlot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xyclade/MachineLearning/a01a1c3acc8fa6f534ab4765f201e6dab78aae04/Images/KNNPlot.png -------------------------------------------------------------------------------- /Images/Mail_per_Sender_Distribution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xyclade/MachineLearning/a01a1c3acc8fa6f534ab4765f201e6dab78aae04/Images/Mail_per_Sender_Distribution.png -------------------------------------------------------------------------------- 
/Images/Mail_per_Sender_log_Distribution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xyclade/MachineLearning/a01a1c3acc8fa6f534ab4765f201e6dab78aae04/Images/Mail_per_Sender_log_Distribution.png -------------------------------------------------------------------------------- /Images/Mail_per_Subject_Distribution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xyclade/MachineLearning/a01a1c3acc8fa6f534ab4765f201e6dab78aae04/Images/Mail_per_Subject_Distribution.png -------------------------------------------------------------------------------- /Images/Mail_per_Subject_log_Distribution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xyclade/MachineLearning/a01a1c3acc8fa6f534ab4765f201e6dab78aae04/Images/Mail_per_Subject_log_Distribution.png -------------------------------------------------------------------------------- /Images/MaleFemalePlot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xyclade/MachineLearning/a01a1c3acc8fa6f534ab4765f201e6dab78aae04/Images/MaleFemalePlot.png -------------------------------------------------------------------------------- /Images/OverFitting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xyclade/MachineLearning/a01a1c3acc8fa6f534ab4765f201e6dab78aae04/Images/OverFitting.png -------------------------------------------------------------------------------- /Images/PCA_Explanatory_Data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xyclade/MachineLearning/a01a1c3acc8fa6f534ab4765f201e6dab78aae04/Images/PCA_Explanatory_Data.png -------------------------------------------------------------------------------- 
/Images/PCA_Normalised.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xyclade/MachineLearning/a01a1c3acc8fa6f534ab4765f201e6dab78aae04/Images/PCA_Normalised.png -------------------------------------------------------------------------------- /Images/PCA_Reduced_Dimension.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xyclade/MachineLearning/a01a1c3acc8fa6f534ab4765f201e6dab78aae04/Images/PCA_Reduced_Dimension.png -------------------------------------------------------------------------------- /Images/Precision.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xyclade/MachineLearning/a01a1c3acc8fa6f534ab4765f201e6dab78aae04/Images/Precision.png -------------------------------------------------------------------------------- /Images/PrecisionFull.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xyclade/MachineLearning/a01a1c3acc8fa6f534ab4765f201e6dab78aae04/Images/PrecisionFull.png -------------------------------------------------------------------------------- /Images/PrecisionHalf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xyclade/MachineLearning/a01a1c3acc8fa6f534ab4765f201e6dab78aae04/Images/PrecisionHalf.png -------------------------------------------------------------------------------- /Images/Recall.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xyclade/MachineLearning/a01a1c3acc8fa6f534ab4765f201e6dab78aae04/Images/Recall.png -------------------------------------------------------------------------------- /Images/RecallFull.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Xyclade/MachineLearning/a01a1c3acc8fa6f534ab4765f201e6dab78aae04/Images/RecallFull.png -------------------------------------------------------------------------------- /Images/RecallHalf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xyclade/MachineLearning/a01a1c3acc8fa6f534ab4765f201e6dab78aae04/Images/RecallHalf.png -------------------------------------------------------------------------------- /Images/SVM_Datapoints.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xyclade/MachineLearning/a01a1c3acc8fa6f534ab4765f201e6dab78aae04/Images/SVM_Datapoints.png -------------------------------------------------------------------------------- /Images/SVM_TestData.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xyclade/MachineLearning/a01a1c3acc8fa6f534ab4765f201e6dab78aae04/Images/SVM_TestData.png -------------------------------------------------------------------------------- /Images/SVM_TrainData.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xyclade/MachineLearning/a01a1c3acc8fa6f534ab4765f201e6dab78aae04/Images/SVM_TrainData.png -------------------------------------------------------------------------------- /Images/Spam_No_Stopwords.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xyclade/MachineLearning/a01a1c3acc8fa6f534ab4765f201e6dab78aae04/Images/Spam_No_Stopwords.png -------------------------------------------------------------------------------- /Images/Spam_Stopwords.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Xyclade/MachineLearning/a01a1c3acc8fa6f534ab4765f201e6dab78aae04/Images/Spam_Stopwords.png -------------------------------------------------------------------------------- /Images/Under-fitting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xyclade/MachineLearning/a01a1c3acc8fa6f534ab4765f201e6dab78aae04/Images/Under-fitting.png -------------------------------------------------------------------------------- /Images/Unscaled_DJI_PCA_Index.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xyclade/MachineLearning/a01a1c3acc8fa6f534ab4765f201e6dab78aae04/Images/Unscaled_DJI_PCA_Index.png -------------------------------------------------------------------------------- /Images/Unscaled_PCA_Index.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xyclade/MachineLearning/a01a1c3acc8fa6f534ab4765f201e6dab78aae04/Images/Unscaled_PCA_Index.png -------------------------------------------------------------------------------- /Images/Weighted_Subject_Distribution.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Xyclade/MachineLearning/a01a1c3acc8fa6f534ab4765f201e6dab78aae04/Images/Weighted_Subject_Distribution.png --------------------------------------------------------------------------------