├── Chapter01 ├── CleaningData.java ├── FileListing.java ├── JsonReading.java ├── JsonWriting.java ├── JsoupTesting.java ├── TestDB.java ├── TestJdom.java ├── TestRecursiveDirectoryTraversal.java ├── TestTika.java ├── TestTsv.java ├── TestUnivocity.java ├── TextFileReadApache.java ├── TextFileReadJava.java └── WebdataExtractionSelenium.java ├── Chapter02 ├── IndexFiles.java └── SearchFiles.java ├── Chapter03 └── chap3 │ └── java │ └── science │ └── data │ ├── AggregateStats.java │ ├── AnovaTest.java │ ├── ChiSquareTest.java │ ├── CovarianceTest.java │ ├── DescriptiveStats.java │ ├── FrequencyStats.java │ ├── GLSRegressionTest.java │ ├── KSTest.java │ ├── OLSRegressionTest.java │ ├── PearsonTest.java │ ├── RegressionTest.java │ ├── SummaryStats.java │ ├── TTest.java │ ├── WordFrequencyStatsApache.java │ └── WordFrequencyStatsJava.java ├── Chapter04 └── Code │ ├── B05916_04_01.png │ ├── B05916_04_02.png │ ├── WekaArffTest.java │ ├── WekaAssociationRuleTest.java │ ├── WekaCVTest.java │ ├── WekaClassesToClusterTest.java │ ├── WekaClusterTest.java │ ├── WekaFeatureSelectionTest.java │ ├── WekaFilteredClassifierTest.java │ ├── WekaLinearRegressionTest.java │ ├── WekaLogisticRegressionTest.java │ └── WekaTrainTest.java ├── Chapter05 └── chapter-5 │ ├── JavaMachineLearning.java │ ├── MOA.java │ ├── Mulan.java │ └── StanfordClassifier.java ├── Chapter06 ├── CosineSimilarity.java ├── Lemmatizer.java ├── OpenNlpSenToken.java ├── SentenceDetection.java ├── WekaClassification.java └── WordDetection.java ├── Chapter07 └── Code │ ├── KMeansClusteringMlib.java │ ├── LinearRegressionMlib.java │ ├── OnlineLogisticRegressionTest.java │ ├── OnlineLogisticRegressionTrain.java │ ├── RandomForestMlib.java │ └── ScalaTest.java ├── Chapter08 ├── Chap-08-Code.rar └── Chap-08-Code │ └── Code │ ├── DBNIrisExample.java │ ├── DeepAutoEncoderExample.java │ └── Word2VecRawTextExample.java ├── Chapter09 └── data │ ├── AreaPlot.java │ ├── HistogramPlot.java │ ├── ScatterPlot.java │ ├── 
SimpleBarPlot.java │ ├── SimpleBoxPlot.java │ ├── SimplePiePlot.java │ └── SineGraph.java ├── LICENSE └── README.md /Chapter01/CleaningData.java: -------------------------------------------------------------------------------- 1 | package chap1.java.science.data; 2 | 3 | public class CleaningData { 4 | public static void main(String[] args) throws Exception { 5 | CleaningData clean = new CleaningData(); 6 | String text = "Your text here you have got from some file"; 7 | String cleanedText = clean.cleanText(text); 8 | //Process cleanedText 9 | } 10 | 11 | public String cleanText(String text){ 12 | text = text.replaceAll("[^\\p{ASCII}]",""); 13 | text = text.replaceAll("\\s+", " "); 14 | text = text.replaceAll("\\p{Cntrl}", ""); 15 | text = text.replaceAll("[^\\p{Print}]", ""); 16 | text = text.replaceAll("\\p{C}", ""); 17 | return text; 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /Chapter01/FileListing.java: -------------------------------------------------------------------------------- 1 | package chap1.java.science.data; 2 | 3 | import java.io.File; 4 | import java.util.List; 5 | import org.apache.commons.io.FileUtils; 6 | import org.apache.commons.io.filefilter.TrueFileFilter; 7 | 8 | public class FileListing{ 9 | public static void main (String[] args){ 10 | FileListing fileListing = new FileListing(); 11 | fileListing.listFiles("Path for the root directory here"); 12 | } 13 | public void listFiles(String rootDir){ 14 | File dir = new File(rootDir); 15 | 16 | List files = (List) FileUtils.listFiles(dir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE); 17 | for (File file : files) { 18 | System.out.println("file: " + file.getAbsolutePath()); 19 | } 20 | } 21 | } -------------------------------------------------------------------------------- /Chapter01/JsonReading.java: -------------------------------------------------------------------------------- 1 | package chap1.java.science.data; 2 | 3 | 4 | import 
java.io.FileNotFoundException; 5 | import java.io.FileReader; 6 | import java.io.IOException; 7 | import java.util.Iterator; 8 | import org.json.simple.JSONArray; 9 | import org.json.simple.JSONObject; 10 | import org.json.simple.parser.JSONParser; 11 | import org.json.simple.parser.ParseException; 12 | 13 | public class JsonReading { 14 | public static void main(String[] args){ 15 | JsonReading jsonReading = new JsonReading(); 16 | jsonReading.readJson("C:/testJSON.json"); 17 | } 18 | public void readJson(String inFileName) { 19 | JSONParser parser = new JSONParser(); 20 | try { 21 | Object obj = parser.parse(new FileReader(inFileName)); 22 | JSONObject jsonObject = (JSONObject) obj; 23 | 24 | String name = (String) jsonObject.get("book"); 25 | System.out.println(name); 26 | 27 | String author = (String) jsonObject.get("author"); 28 | System.out.println(author); 29 | 30 | JSONArray reviews = (JSONArray) jsonObject.get("messages"); 31 | Iterator iterator = reviews.iterator(); 32 | while (iterator.hasNext()) { 33 | System.out.println(iterator.next()); 34 | } 35 | } catch (FileNotFoundException e) { 36 | //Your exception handling here 37 | } catch (IOException e) { 38 | //Your exception handling here 39 | } catch (ParseException e) { 40 | //Your exception handling here 41 | } 42 | } 43 | } 44 | 45 | -------------------------------------------------------------------------------- /Chapter01/JsonWriting.java: -------------------------------------------------------------------------------- 1 | package chap1.java.science.data; 2 | 3 | import java.io.FileWriter; 4 | import java.io.IOException; 5 | import org.json.simple.JSONArray; 6 | import org.json.simple.JSONObject; 7 | 8 | public class JsonWriting { 9 | 10 | public static void main(String[] args) { 11 | JsonWriting jsonWriting = new JsonWriting(); 12 | jsonWriting.writeJson("C:/testJSON.json"); 13 | } 14 | 15 | public void writeJson(String outFileName){ 16 | JSONObject obj = new JSONObject(); 17 | obj.put("book", 
"Harry Potter and the Philosopher's Stone"); 18 | obj.put("author", "J. K. Rowling"); 19 | 20 | JSONArray list = new JSONArray(); 21 | list.add("There are characters in this book that will remind us of all the people we have met. Everybody knows or knew a spoilt, overweight boy like Dudley or a bossy and interfering (yet kind-hearted) girl like Hermione"); 22 | list.add("Hogwarts is a truly magical place, not only in the most obvious way but also in all the detail that the author has gone to describe it so vibrantly."); 23 | list.add("Parents need to know that this thrill-a-minute story, the first in the Harry Potter series, respects kids' intelligence and motivates them to tackle its greater length and complexity, play imaginative games, and try to solve its logic puzzles. "); 24 | 25 | obj.put("messages", list); 26 | 27 | try { 28 | 29 | FileWriter file = new FileWriter(outFileName); 30 | file.write(obj.toJSONString()); 31 | file.flush(); 32 | file.close(); 33 | 34 | } catch (IOException e) { 35 | e.printStackTrace(); 36 | } 37 | 38 | System.out.print(obj); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /Chapter01/JsoupTesting.java: -------------------------------------------------------------------------------- 1 | package chap1.java.science.data; 2 | 3 | import java.io.IOException; 4 | 5 | import org.jsoup.Jsoup; 6 | import org.jsoup.nodes.Document; 7 | import org.jsoup.nodes.Element; 8 | import org.jsoup.select.Elements; 9 | 10 | public class JsoupTesting { 11 | public static void main(String[] args){ 12 | JsoupTesting test = new JsoupTesting(); 13 | test.extractDataWithJsoup("http://www.sustainalytics.com"); 14 | } 15 | 16 | public void extractDataWithJsoup(String href){ 17 | Document doc = null; 18 | try { 19 | doc = Jsoup.connect(href).timeout(10*1000).userAgent("Mozilla").ignoreHttpErrors(true).get(); 20 | } catch (IOException e) { 21 | //Your exception handling here 22 | } 23 | if(doc != null){ 24 | 
String title = doc.title(); 25 | String text = doc.body().text(); 26 | Elements links = doc.select("a[href]"); 27 | for (Element link : links) { 28 | String linkHref = link.attr("href"); 29 | String linkText = link.text(); 30 | String linkOuterHtml = link.outerHtml(); 31 | String linkInnerHtml = link.html(); 32 | } 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /Chapter01/TestDB.java: -------------------------------------------------------------------------------- 1 | package chap1.java.science.data; 2 | 3 | import java.sql.*; 4 | import com.mysql.jdbc.jdbc2.optional.MysqlDataSource; 5 | public class TestDB{ 6 | public static void main(String[] args){ 7 | TestDB test = new TestDB(); 8 | test.readTable("your user name", "your password", "your MySQL server name"); 9 | } 10 | public void readTable(String user, String password, String server){ 11 | MysqlDataSource dataSource = new MysqlDataSource(); 12 | dataSource.setUser(user); 13 | dataSource.setPassword(password); 14 | dataSource.setServerName(server); 15 | try{ 16 | Connection conn = dataSource.getConnection(); 17 | Statement stmt = conn.createStatement(); 18 | ResultSet rs = stmt.executeQuery("SELECT * FROM data_science.books"); 19 | while (rs.next()){ 20 | int id = rs.getInt("id"); 21 | String book = rs.getString("book_name"); 22 | String author = rs.getString("author_name"); 23 | Date dateCreated = rs.getDate("date_created"); 24 | System.out.format("%s, %s, %s, %s\n", id, book, author, dateCreated); 25 | } 26 | rs.close(); 27 | stmt.close(); 28 | conn.close(); 29 | }catch (Exception e){ 30 | //Your exception handling mechanism goes here. 
31 | } 32 | } 33 | } 34 | 35 | -------------------------------------------------------------------------------- /Chapter01/TestJdom.java: -------------------------------------------------------------------------------- 1 | package chap1.java.science.data; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.util.List; 6 | 7 | import org.jdom2.Document; 8 | import org.jdom2.Element; 9 | import org.jdom2.JDOMException; 10 | import org.jdom2.input.SAXBuilder; 11 | 12 | public class TestJdom { 13 | 14 | public static void main(String[] args){ 15 | TestJdom test = new TestJdom(); 16 | test.parseXml("C:/dummyxml.com"); 17 | 18 | } 19 | public void parseXml(String fileName){ 20 | SAXBuilder builder = new SAXBuilder(); 21 | File file = new File(fileName); 22 | try { 23 | Document document = (Document) builder.build(file); 24 | Element rootNode = document.getRootElement(); 25 | List list = rootNode.getChildren("author"); 26 | for (int i = 0; i < list.size(); i++) { 27 | Element node = (Element) list.get(i); 28 | System.out.println("First Name : " + node.getChildText("firstname")); 29 | System.out.println("Last Name : " + node.getChildText("lastname")); 30 | } 31 | } catch (IOException io) { 32 | System.out.println(io.getMessage()); 33 | } catch (JDOMException jdomex) { 34 | System.out.println(jdomex.getMessage()); 35 | } 36 | 37 | 38 | 39 | } 40 | 41 | } 42 | -------------------------------------------------------------------------------- /Chapter01/TestRecursiveDirectoryTraversal.java: -------------------------------------------------------------------------------- 1 | package chap1.java.science.data; 2 | 3 | import java.io.File; 4 | import java.util.HashSet; 5 | import java.util.Set; 6 | 7 | public class TestRecursiveDirectoryTraversal { 8 | public static void main(String[] args){ 9 | System.out.println(listFiles(new File("")).size()); 10 | } 11 | 12 | public static Set listFiles(File rootDir) { 13 | Set fileSet = new HashSet(); 14 | if(rootDir == 
null || rootDir.listFiles()==null){ 15 | return fileSet; 16 | } 17 | for (File fileOrDir : rootDir.listFiles()) { 18 | if (fileOrDir.isFile()){ 19 | fileSet.add(fileOrDir); 20 | } 21 | else{ 22 | fileSet.addAll(listFiles(fileOrDir)); 23 | } 24 | } 25 | 26 | return fileSet; 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /Chapter01/TestTika.java: -------------------------------------------------------------------------------- 1 | package chap1.java.science.data; 2 | 3 | import java.io.FileInputStream; 4 | import java.io.IOException; 5 | import java.io.InputStream; 6 | import org.apache.tika.metadata.Metadata; 7 | import org.apache.tika.parser.AutoDetectParser; 8 | import org.apache.tika.parser.ParseContext; 9 | import org.apache.tika.sax.BodyContentHandler; 10 | 11 | public class TestTika { 12 | public static void main(String args[]) throws Exception { 13 | TestTika tika = new TestTika(); 14 | tika.convertPdf("C:/testPDF.pdf"); 15 | } 16 | public void convertPdf(String fileName){ 17 | InputStream stream = null; 18 | try { 19 | stream = new FileInputStream(fileName); 20 | AutoDetectParser parser = new AutoDetectParser(); 21 | BodyContentHandler handler = new BodyContentHandler(-1); 22 | Metadata metadata = new Metadata(); 23 | parser.parse(stream, handler, metadata, new ParseContext()); 24 | System.out.println(handler.toString()); 25 | }catch (Exception e) { 26 | e.printStackTrace(); 27 | }finally { 28 | if (stream != null) 29 | try { 30 | stream.close(); 31 | } catch (IOException e) { 32 | System.out.println("Error closing stream"); 33 | } 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /Chapter01/TestTsv.java: -------------------------------------------------------------------------------- 1 | package chap1.java.science.data; 2 | 3 | import java.io.File; 4 | import java.util.Arrays; 5 | import java.util.List; 6 | 7 | import 
com.univocity.parsers.tsv.TsvParser; 8 | import com.univocity.parsers.tsv.TsvParserSettings; 9 | 10 | public class TestTsv { 11 | public void parseTsv(String fileName){ 12 | TsvParserSettings settings = new TsvParserSettings(); 13 | settings.getFormat().setLineSeparator("\n"); 14 | TsvParser parser = new TsvParser(settings); 15 | List allRows = parser.parseAll(new File(fileName)); 16 | for (int i = 0; i < allRows.size(); i++){ 17 | System.out.println(Arrays.asList(allRows.get(i))); 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /Chapter01/TestUnivocity.java: -------------------------------------------------------------------------------- 1 | package chap1.java.science.data; 2 | 3 | 4 | import java.io.File; 5 | import java.util.Arrays; 6 | import java.util.List; 7 | 8 | import com.univocity.parsers.common.processor.RowListProcessor; 9 | import com.univocity.parsers.csv.CsvParser; 10 | import com.univocity.parsers.csv.CsvParserSettings; 11 | 12 | public class TestUnivocity { 13 | public void parseCSV(String fileName){ 14 | CsvParserSettings parserSettings = new CsvParserSettings(); 15 | parserSettings.setLineSeparatorDetectionEnabled(true); 16 | RowListProcessor rowProcessor = new RowListProcessor(); 17 | parserSettings.setRowProcessor(rowProcessor); 18 | parserSettings.setHeaderExtractionEnabled(true); 19 | CsvParser parser = new CsvParser(parserSettings); 20 | parser.parse(new File(fileName)); 21 | 22 | String[] headers = rowProcessor.getHeaders(); 23 | List rows = rowProcessor.getRows(); 24 | for (int i = 0; i < rows.size(); i++){ 25 | System.out.println(Arrays.asList(rows.get(i))); 26 | } 27 | } 28 | 29 | public static void main(String[] args){ 30 | TestUnivocity test = new TestUnivocity(); 31 | test.parseCSV("C:/testCSV.csv"); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /Chapter01/TextFileReadApache.java: 
-------------------------------------------------------------------------------- 1 | package chap1.java.science.data; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | 6 | import org.apache.commons.io.FileUtils; 7 | 8 | public class TextFileReadApache { 9 | public static void main(String[] args){ 10 | TextFileReadApache test = new TextFileReadApache(); 11 | test.readFile("C:/dummy.txt"); 12 | 13 | } 14 | public void readFile(String fileName){ 15 | File file = new File(fileName); 16 | String text = ""; 17 | try { 18 | text = FileUtils.readFileToString(file, "UTF-8"); 19 | } catch (IOException e) { 20 | System.out.println("Error reading " + file.getAbsolutePath()); 21 | } 22 | //process text 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /Chapter01/TextFileReadJava.java: -------------------------------------------------------------------------------- 1 | package chap1.java.science.data; 2 | 3 | import java.io.IOException; 4 | import java.nio.file.Files; 5 | import java.nio.file.Paths; 6 | import java.util.stream.Stream; 7 | 8 | public class TextFileReadJava { 9 | public static void main(String[] args){ 10 | TextFileReadJava test = new TextFileReadJava(); 11 | test.readTextFile("C:/dummy.txt"); 12 | } 13 | public void readTextFile(String file){ 14 | try (Stream stream = Files.lines(Paths.get(file))) { 15 | stream.forEach(System.out::println); 16 | } catch (IOException e) { 17 | //Your exception handling here 18 | } 19 | 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /Chapter01/WebdataExtractionSelenium.java: -------------------------------------------------------------------------------- 1 | package chap1.java.science.data; 2 | 3 | import org.openqa.selenium.By; 4 | import org.openqa.selenium.WebDriver; 5 | import org.openqa.selenium.WebElement; 6 | import org.openqa.selenium.firefox.FirefoxDriver; 7 | 8 | public class WebdataExtractionSelenium { 9 
| public static void main(String[] args) { 10 | WebDriver driver = new FirefoxDriver(); 11 | driver.get("http://cogenglab.csd.uwo.ca/rushdi.htm"); 12 | 13 | WebElement webElement = driver.findElement(By.xpath("//*[@id='content']")); 14 | System.out.println(webElement.getText()); 15 | 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /Chapter02/IndexFiles.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.demo; 2 | 3 | import org.apache.lucene.analysis.Analyzer; 4 | import org.apache.lucene.analysis.standard.StandardAnalyzer; 5 | import org.apache.lucene.document.Document; 6 | import org.apache.lucene.document.Field; 7 | import org.apache.lucene.document.LongPoint; 8 | import org.apache.lucene.document.StringField; 9 | import org.apache.lucene.document.TextField; 10 | import org.apache.lucene.index.IndexWriter; 11 | import org.apache.lucene.index.IndexWriterConfig.OpenMode; 12 | import org.apache.lucene.index.IndexWriterConfig; 13 | import org.apache.lucene.index.Term; 14 | import org.apache.lucene.store.Directory; 15 | import org.apache.lucene.store.FSDirectory; 16 | 17 | import java.io.BufferedReader; 18 | import java.io.IOException; 19 | import java.io.InputStream; 20 | import java.io.InputStreamReader; 21 | import java.nio.charset.StandardCharsets; 22 | import java.nio.file.FileVisitResult; 23 | import java.nio.file.Files; 24 | import java.nio.file.Path; 25 | import java.nio.file.Paths; 26 | import java.nio.file.SimpleFileVisitor; 27 | import java.nio.file.attribute.BasicFileAttributes; 28 | import java.util.Date; 29 | 30 | public class IndexFiles { 31 | static void indexDocs(final IndexWriter writer, Path path) throws IOException { 32 | if (Files.isDirectory(path)) { 33 | Files.walkFileTree(path, new SimpleFileVisitor() { 34 | @Override 35 | public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { 36 | try { 37 | 
indexDoc(writer, file, attrs.lastModifiedTime().toMillis()); 38 | } catch (IOException ignore) { 39 | } 40 | return FileVisitResult.CONTINUE; 41 | } 42 | } 43 | ); 44 | } else { 45 | indexDoc(writer, path, Files.getLastModifiedTime(path).toMillis()); 46 | } 47 | } 48 | 49 | static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException { 50 | try (InputStream stream = Files.newInputStream(file)) { 51 | Document doc = new Document(); 52 | Field pathField = new StringField("path", file.toString(), Field.Store.YES); 53 | doc.add(pathField); 54 | doc.add(new LongPoint("modified", lastModified)); 55 | doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)))); 56 | 57 | if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { 58 | System.out.println("adding " + file); 59 | writer.addDocument(doc); 60 | } else { 61 | System.out.println("updating " + file); 62 | writer.updateDocument(new Term("path", file.toString()), doc); 63 | } 64 | } 65 | } 66 | public static void main(String[] args) { 67 | String indexPath = "index"; 68 | String docsPath = null; 69 | boolean create = true; 70 | for(int i=0;i classes = new ArrayList(); 19 | classes.add(calorie); 20 | classes.add(fat); 21 | classes.add(carb); 22 | classes.add(control); 23 | 24 | System.out.println(TestUtils.oneWayAnovaFValue(classes)); // F-value 25 | System.out.println(TestUtils.oneWayAnovaPValue(classes)); // P-value 26 | System.out.println(TestUtils.oneWayAnovaTest(classes, 0.05)); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /Chapter03/chap3/java/science/data/ChiSquareTest.java: -------------------------------------------------------------------------------- 1 | package chap3.java.science.data; 2 | 3 | import org.apache.commons.math3.stat.inference.TestUtils; 4 | 5 | public class ChiSquareTest { 6 | public static void main(String[] args){ 7 | long[] observed = {43, 21, 25, 42, 57, 
59}; 8 | double[] expected = {99, 65, 79, 75, 87, 81}; 9 | ChiSquareTest test = new ChiSquareTest(); 10 | test.getChiSquare(observed, expected); 11 | } 12 | public void getChiSquare(long[] observed, double[] expected){ 13 | System.out.println(TestUtils.chiSquare(expected, observed));//t statistics 14 | System.out.println(TestUtils.chiSquareTest(expected, observed));//p value 15 | System.out.println(TestUtils.chiSquareTest(expected, observed, 0.05)); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /Chapter03/chap3/java/science/data/CovarianceTest.java: -------------------------------------------------------------------------------- 1 | package chap3.java.science.data; 2 | 3 | import org.apache.commons.math3.stat.correlation.Covariance; 4 | 5 | public class CovarianceTest { 6 | public static void main(String[] args){ 7 | double[] x = {43, 21, 25, 42, 57, 59}; 8 | double[] y = {99, 65, 79, 75, 87, 81}; 9 | CovarianceTest test = new CovarianceTest(); 10 | test.calculateCov(x, y); 11 | } 12 | public void calculateCov(double[] x, double[] y){ 13 | double covariance = new Covariance().covariance(x, y, false);//take out false too 14 | System.out.println(covariance); 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /Chapter03/chap3/java/science/data/DescriptiveStats.java: -------------------------------------------------------------------------------- 1 | package chap3.java.science.data; 2 | 3 | import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics; 4 | 5 | public class DescriptiveStats { 6 | public static void main(String[] args){ 7 | double[] values = {32, 39, 14, 98, 45, 44, 45, 34, 89, 67, 0, 15, 0, 56, 88}; 8 | DescriptiveStats descStatTest = new DescriptiveStats(); 9 | descStatTest.getDescStats(values); 10 | 11 | } 12 | public void getDescStats(double[] values){ 13 | DescriptiveStatistics stats = new DescriptiveStatistics(); 14 | for( int i = 0; i < 
values.length; i++) { 15 | stats.addValue(values[i]); 16 | } 17 | double mean = stats.getMean(); 18 | double std = stats.getStandardDeviation(); 19 | double median = stats.getPercentile(50); 20 | System.out.println(mean + "\t" + std + "\t" + median); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /Chapter03/chap3/java/science/data/FrequencyStats.java: -------------------------------------------------------------------------------- 1 | package chap3.java.science.data; 2 | 3 | import org.apache.commons.math3.stat.Frequency; 4 | 5 | public class FrequencyStats { 6 | public static void main(String[] args){ 7 | double[] values = {32, 39, 14, 98, 45, 44, 45, 34, 89, 67, 0, 15, 0, 56, 88}; 8 | FrequencyStats freqTest = new FrequencyStats(); 9 | freqTest.getFreqStats(values); 10 | 11 | } 12 | public void getFreqStats(double[] values){ 13 | Frequency freq = new Frequency(); 14 | for( int i = 0; i < values.length; i++) { 15 | freq.addValue(values[i]); 16 | } 17 | 18 | for( int i = 0; i < values.length; i++) { 19 | System.out.println(freq.getCount(values[i])); 20 | } 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /Chapter03/chap3/java/science/data/GLSRegressionTest.java: -------------------------------------------------------------------------------- 1 | package chap3.java.science.data; 2 | 3 | import org.apache.commons.math3.stat.regression.GLSMultipleLinearRegression; 4 | 5 | public class GLSRegressionTest { 6 | public static void main(String[] args){ 7 | double[] y = new double[]{11.0, 12.0, 13.0, 14.0, 15.0, 16.0}; 8 | double[][] x = new double[6][]; 9 | x[0] = new double[]{0, 0, 0, 0, 0}; 10 | x[1] = new double[]{2.0, 0, 0, 0, 0}; 11 | x[2] = new double[]{0, 3.0, 0, 0, 0}; 12 | x[3] = new double[]{0, 0, 4.0, 0, 0}; 13 | x[4] = new double[]{0, 0, 0, 5.0, 0}; 14 | x[5] = new double[]{0, 0, 0, 0, 6.0}; 15 | double[][] omega = new double[6][]; 16 | omega[0] = new 
double[]{1.1, 0, 0, 0, 0, 0}; 17 | omega[1] = new double[]{0, 2.2, 0, 0, 0, 0}; 18 | omega[2] = new double[]{0, 0, 3.3, 0, 0, 0}; 19 | omega[3] = new double[]{0, 0, 0, 4.4, 0, 0}; 20 | omega[4] = new double[]{0, 0, 0, 0, 5.5, 0}; 21 | omega[5] = new double[]{0, 0, 0, 0, 0, 6.6}; 22 | GLSRegressionTest test = new GLSRegressionTest(); 23 | test.calculateOlsRegression(x, y, omega); 24 | } 25 | public void calculateOlsRegression(double[][] x, double[] y, double[][] omega){ 26 | GLSMultipleLinearRegression regression = new GLSMultipleLinearRegression(); 27 | regression.newSampleData(y, x, omega); 28 | 29 | double[] beta = regression.estimateRegressionParameters(); 30 | double[] residuals = regression.estimateResiduals(); 31 | double[][] parametersVariance = regression.estimateRegressionParametersVariance(); 32 | double regressandVariance = regression.estimateRegressandVariance(); 33 | double sigma = regression.estimateRegressionStandardError(); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /Chapter03/chap3/java/science/data/KSTest.java: -------------------------------------------------------------------------------- 1 | package chap3.java.science.data; 2 | 3 | import org.apache.commons.math3.stat.inference.TestUtils; 4 | 5 | public class KSTest { 6 | public static void main(String[] args){ 7 | double[] x = {43, 21, 25, 42, 57, 59}; 8 | double[] y = {99, 65, 79, 75, 87, 81}; 9 | KSTest test = new KSTest(); 10 | test.calculateKs(x, y); 11 | } 12 | public void calculateKs(double[] x, double[] y){ 13 | double d = TestUtils.kolmogorovSmirnovStatistic(x, y); 14 | System.out.println(TestUtils.kolmogorovSmirnovTest(x, y, false)); 15 | System.out.println(TestUtils.exactP(d, x.length, y.length, false)); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /Chapter03/chap3/java/science/data/OLSRegressionTest.java: 
-------------------------------------------------------------------------------- 1 | package chap3.java.science.data; 2 | 3 | import org.apache.commons.math3.stat.regression.OLSMultipleLinearRegression; 4 | 5 | public class OLSRegressionTest { 6 | public static void main(String[] args){ 7 | double[] y = new double[]{11.0, 12.0, 13.0, 14.0, 15.0, 16.0}; 8 | double[][] x = new double[6][]; 9 | x[0] = new double[]{0, 0, 0, 0, 0}; 10 | x[1] = new double[]{2.0, 0, 0, 0, 0}; 11 | x[2] = new double[]{0, 3.0, 0, 0, 0}; 12 | x[3] = new double[]{0, 0, 4.0, 0, 0}; 13 | x[4] = new double[]{0, 0, 0, 5.0, 0}; 14 | x[5] = new double[]{0, 0, 0, 0, 6.0}; 15 | OLSRegressionTest test = new OLSRegressionTest(); 16 | test.calculateOlsRegression(x, y); 17 | } 18 | public void calculateOlsRegression(double[][] x, double[] y){ 19 | OLSMultipleLinearRegression regression = new OLSMultipleLinearRegression(); 20 | regression.newSampleData(y, x); 21 | 22 | double[] beta = regression.estimateRegressionParameters(); 23 | double[] residuals = regression.estimateResiduals(); 24 | double[][] parametersVariance = regression.estimateRegressionParametersVariance(); 25 | double regressandVariance = regression.estimateRegressandVariance(); 26 | double rSquared = regression.calculateRSquared(); 27 | double sigma = regression.estimateRegressionStandardError(); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /Chapter03/chap3/java/science/data/PearsonTest.java: -------------------------------------------------------------------------------- 1 | package chap3.java.science.data; 2 | 3 | import org.apache.commons.math3.stat.correlation.PearsonsCorrelation; 4 | 5 | public class PearsonTest { 6 | public static void main(String[] args){ 7 | double[] x = {43, 21, 25, 42, 57, 59}; 8 | double[] y = {99, 65, 79, 75, 87, 81}; 9 | PearsonTest test = new PearsonTest(); 10 | test.calculatePearson(x, y); 11 | } 12 | public void calculatePearson(double[] x, double[] y){ 
13 | PearsonsCorrelation pCorrelation = new PearsonsCorrelation(); 14 | double cor = pCorrelation.correlation(x, y);//take out false too 15 | System.out.println(cor); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /Chapter03/chap3/java/science/data/RegressionTest.java: -------------------------------------------------------------------------------- 1 | package chap3.java.science.data; 2 | 3 | import org.apache.commons.math3.stat.regression.SimpleRegression; 4 | 5 | public class RegressionTest { 6 | 7 | public static void main(String[] args){ 8 | double[][] data = { { 1, 3 }, {2, 5 }, {3, 7 }, {4, 14 }, {5, 11 }}; 9 | RegressionTest test = new RegressionTest(); 10 | test.calculateRegression(data); 11 | } 12 | public void calculateRegression(double[][] data){ 13 | SimpleRegression regression = new SimpleRegression(); 14 | regression.addData(data); 15 | System.out.println(regression.getIntercept()); 16 | System.out.println(regression.getSlope()); 17 | System.out.println(regression.getSlopeStdErr()); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /Chapter03/chap3/java/science/data/SummaryStats.java: -------------------------------------------------------------------------------- 1 | package chap3.java.science.data; 2 | 3 | import org.apache.commons.math3.stat.descriptive.SummaryStatistics; 4 | 5 | public class SummaryStats { 6 | public static void main(String[] args){ 7 | double[] values = {32, 39, 14, 98, 45, 44, 45, 34, 89, 67, 0, 15, 0, 56, 88}; 8 | SummaryStats summaryStatTest = new SummaryStats(); 9 | summaryStatTest.getSummaryStats(values); 10 | } 11 | public void getSummaryStats(double[] values){ 12 | SummaryStatistics stats = new SummaryStatistics(); 13 | for( int i = 0; i < values.length; i++) { 14 | stats.addValue(values[i]); 15 | } 16 | double mean = stats.getMean(); 17 | double std = stats.getStandardDeviation(); 18 | System.out.println(mean + "\t" + 
std); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /Chapter03/chap3/java/science/data/TTest.java: -------------------------------------------------------------------------------- 1 | package chap3.java.science.data; 2 | 3 | import org.apache.commons.math3.stat.inference.TestUtils; 4 | 5 | public class TTest { 6 | public static void main(String[] args){ 7 | double[] sample1 = {43, 21, 25, 42, 57, 59}; 8 | double[] sample2 = {99, 65, 79, 75, 87, 81}; 9 | TTest test = new TTest(); 10 | test.getTtest(sample1, sample2); 11 | } 12 | public void getTtest(double[] sample1, double[] sample2){ 13 | System.out.println(TestUtils.pairedT(sample1, sample2));//t statistics 14 | System.out.println(TestUtils.pairedTTest(sample1, sample2));//p value 15 | System.out.println(TestUtils.pairedTTest(sample1, sample2, 0.05)); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /Chapter03/chap3/java/science/data/WordFrequencyStatsApache.java: -------------------------------------------------------------------------------- 1 | package chap3.java.science.data; 2 | 3 | import org.apache.commons.math3.stat.Frequency; 4 | 5 | public class WordFrequencyStatsApache { 6 | public static void main(String[] args){ 7 | String str = "Horatio says 'tis but our fantasy, " 8 | + "And will not let belief take hold of him " 9 | + "Touching this dreaded sight, twice seen of us. 
" 10 | + "Therefore I have entreated him along, 35" 11 | + "With us to watch the minutes of this night, " 12 | + "That, if again this apparition come, " 13 | + "He may approve our eyes and speak to it."; 14 | String[] words = str.toLowerCase().split("\\W+"); 15 | WordFrequencyStatsApache freqTest = new WordFrequencyStatsApache(); 16 | freqTest.getFreqStats(words); 17 | 18 | } 19 | public void getFreqStats(String[] words){ 20 | Frequency freq = new Frequency(); 21 | for( int i = 0; i < words.length; i++) { 22 | freq.addValue(words[i].trim()); 23 | } 24 | 25 | for( int i = 0; i < words.length; i++) { 26 | System.out.println(words[i] + "=" + freq.getCount(words[i])); 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /Chapter03/chap3/java/science/data/WordFrequencyStatsJava.java: -------------------------------------------------------------------------------- 1 | package chap3.java.science.data; 2 | 3 | import java.util.Map; 4 | import java.util.stream.Collectors; 5 | import java.util.stream.Stream; 6 | 7 | public class WordFrequencyStatsJava { 8 | public static void main(String[] args){ 9 | String str = "Horatio says 'tis but our fantasy, " 10 | + "And will not let belief take hold of him " 11 | + "Touching this dreaded sight, twice seen of us. 
" 12 | + "Therefore I have entreated him along, 35" 13 | + "With us to watch the minutes of this night, " 14 | + "That, if again this apparition come, " 15 | + "He may approve our eyes and speak to it."; 16 | 17 | WordFrequencyStatsJava freqTest = new WordFrequencyStatsJava(); 18 | freqTest.getFreqStats(str); 19 | } 20 | public void getFreqStats(String str){ 21 | Stream stream = Stream.of(str.toLowerCase().split("\\W+")).parallel(); 22 | Map wordFreq = stream 23 | .collect(Collectors.groupingBy(String::toString,Collectors.counting())); 24 | wordFreq.forEach((k,v)->System.out.println(k + "=" + v)); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /Chapter04/Code/B05916_04_01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Java-Data-Science-Cookbook/b2bf9ef367bf6c04a96e24123e4160b733b7fed9/Chapter04/Code/B05916_04_01.png -------------------------------------------------------------------------------- /Chapter04/Code/B05916_04_02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Java-Data-Science-Cookbook/b2bf9ef367bf6c04a96e24123e4160b733b7fed9/Chapter04/Code/B05916_04_02.png -------------------------------------------------------------------------------- /Chapter04/Code/WekaArffTest.java: -------------------------------------------------------------------------------- 1 | package chap4.java.science.data; 2 | 3 | import java.io.BufferedWriter; 4 | import java.io.FileWriter; 5 | import java.util.ArrayList; 6 | 7 | import weka.core.Attribute; 8 | import weka.core.DenseInstance; 9 | import weka.core.Instances; 10 | 11 | public class WekaArffTest { 12 | public static void main(String[] args) throws Exception { 13 | ArrayList attributes; 14 | ArrayList classVals; 15 | Instances data; 16 | double[] values; 17 | 18 | // Set up attributes 19 | attributes 
= new ArrayList(); 20 | // Numeric attribute 21 | attributes.add(new Attribute("age")); 22 | // String attribute 23 | ArrayList empty = null; 24 | attributes.add(new Attribute("name", empty)); 25 | // Date attribute 26 | attributes.add(new Attribute("dob", "yyyy-MM-dd")); 27 | classVals = new ArrayList(); 28 | for (int i = 0; i < 5; i++){ 29 | classVals.add("class" + (i + 1)); 30 | } 31 | Attribute classVal = new Attribute("class", classVals); 32 | attributes.add(classVal); 33 | 34 | // Create Instances object 35 | data = new Instances("MyRelation", attributes, 0); 36 | 37 | // Data fill up 38 | // First instance 39 | values = new double[data.numAttributes()]; 40 | values[0] = 35; 41 | values[1] = data.attribute(1).addStringValue("John Doe"); 42 | values[2] = data.attribute(2).parseDate("1981-01-20"); 43 | values[3] = classVals.indexOf("class3"); 44 | 45 | // add 46 | data.add(new DenseInstance(1.0, values)); 47 | 48 | // Second instance 49 | values = new double[data.numAttributes()]; // important: needs NEW array! 
50 | values[0] = 30; 51 | values[1] = data.attribute(1).addStringValue("Harry Potter"); 52 | values[2] = data.attribute(2).parseDate("1986-07-05"); 53 | values[3] = classVals.indexOf("class1"); 54 | 55 | // add 56 | data.add(new DenseInstance(1.0, values)); 57 | 58 | //writing arff file to disk 59 | BufferedWriter writer = new BufferedWriter(new FileWriter("c:/training.arff")); 60 | writer.write(data.toString()); 61 | writer.close(); 62 | 63 | // Output data 64 | System.out.println(data); 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /Chapter04/Code/WekaAssociationRuleTest.java: -------------------------------------------------------------------------------- 1 | package chap4.java.science.data; 2 | 3 | import weka.associations.Apriori; 4 | import weka.core.Instances; 5 | import weka.core.converters.ConverterUtils.DataSource; 6 | 7 | public class WekaAssociationRuleTest { 8 | Instances superMarket = null; 9 | Apriori apriori; 10 | public void loadArff(String arffInput){ 11 | DataSource source = null; 12 | try { 13 | source = new DataSource(arffInput); 14 | superMarket = source.getDataSet(); 15 | } catch (Exception e1) { 16 | } 17 | } 18 | public void generateRule(){ 19 | apriori = new Apriori(); 20 | try { 21 | // apriori.setNumRules(20); 22 | apriori.buildAssociations(superMarket); 23 | System.out.println(apriori); 24 | } catch (Exception e) { 25 | } 26 | } 27 | public static void main(String args[]){ 28 | WekaAssociationRuleTest test = new WekaAssociationRuleTest(); 29 | test.loadArff("C:\\Program Files\\Weka-3-6\\data\\supermarket.arff"); 30 | test.generateRule(); 31 | } 32 | } -------------------------------------------------------------------------------- /Chapter04/Code/WekaCVTest.java: -------------------------------------------------------------------------------- 1 | package chap4.java.science.data; 2 | 3 | import java.util.Random; 4 | 5 | import weka.classifiers.Evaluation; 6 | import 
weka.classifiers.bayes.NaiveBayes; 7 | import weka.core.Instances; 8 | import weka.core.converters.ConverterUtils.DataSource; 9 | 10 | public class WekaCVTest { 11 | Instances iris = null; 12 | NaiveBayes nb; 13 | 14 | public void loadArff(String arffInput){ 15 | DataSource source = null; 16 | try { 17 | source = new DataSource(arffInput); 18 | iris = source.getDataSet(); 19 | if (iris.classIndex() == -1) 20 | iris.setClassIndex(iris.numAttributes() - 1); 21 | } catch (Exception e1) { 22 | } 23 | } 24 | 25 | public void generateModel(){ 26 | nb = new NaiveBayes(); 27 | try { 28 | nb.buildClassifier(iris); 29 | } catch (Exception e) { 30 | 31 | } 32 | } 33 | 34 | public void saveModel(String modelPath){ 35 | try { 36 | weka.core.SerializationHelper.write(modelPath, nb); 37 | } catch (Exception e) { 38 | } 39 | } 40 | 41 | public void crossValidate(){ 42 | Evaluation eval = null; 43 | try { 44 | eval = new Evaluation(iris); 45 | eval.crossValidateModel(nb, iris, 10, new Random(1)); 46 | System.out.println(eval.toSummaryString()); 47 | } catch (Exception e1) { 48 | } 49 | } 50 | 51 | public static void main(String[] args){ 52 | WekaCVTest test = new WekaCVTest(); 53 | test.loadArff("C:/Program Files/Weka-3-6/data/iris.arff"); 54 | test.generateModel(); 55 | test.saveModel("c:/nb.model"); 56 | test.crossValidate(); 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /Chapter04/Code/WekaClassesToClusterTest.java: -------------------------------------------------------------------------------- 1 | package chap4.java.science.data; 2 | 3 | import weka.clusterers.ClusterEvaluation; 4 | import weka.clusterers.EM; 5 | import weka.core.Instances; 6 | import weka.core.converters.ConverterUtils.DataSource; 7 | import weka.filters.Filter; 8 | import weka.filters.unsupervised.attribute.Remove; 9 | 10 | public class WekaClassesToClusterTest { 11 | Instances weather = null; 12 | EM clusterer; 13 | 14 | public void loadArff(String 
arffInput){ 15 | DataSource source = null; 16 | try { 17 | source = new DataSource(arffInput); 18 | weather = source.getDataSet(); 19 | weather.setClassIndex(weather.numAttributes() - 1); 20 | } catch (Exception e1) { 21 | } 22 | } 23 | 24 | public void generateClassToCluster(){ 25 | Remove filter = new Remove(); 26 | filter.setAttributeIndices("" + (weather.classIndex() + 1)); 27 | try { 28 | filter.setInputFormat(weather); 29 | Instances dataClusterer = Filter.useFilter(weather, filter); 30 | clusterer = new EM(); 31 | clusterer.buildClusterer(dataClusterer); 32 | ClusterEvaluation eval = new ClusterEvaluation(); 33 | eval.setClusterer(clusterer); 34 | eval.evaluateClusterer(weather); 35 | 36 | System.out.println(eval.clusterResultsToString()); 37 | } catch (Exception e) { 38 | } 39 | } 40 | 41 | public static void main(String[] args){ 42 | WekaClassesToClusterTest test = new WekaClassesToClusterTest(); 43 | test.loadArff("C:/Program Files/Weka-3-6/data/weather.nominal.arff"); 44 | test.generateClassToCluster(); 45 | } 46 | } -------------------------------------------------------------------------------- /Chapter04/Code/WekaClusterTest.java: -------------------------------------------------------------------------------- 1 | package chap4.java.science.data; 2 | 3 | import weka.clusterers.SimpleKMeans; 4 | import weka.core.Instances; 5 | import weka.core.converters.ConverterUtils.DataSource; 6 | 7 | public class WekaClusterTest { 8 | Instances cpu = null; 9 | SimpleKMeans kmeans; 10 | 11 | public void loadArff(String arffInput){ 12 | DataSource source = null; 13 | try { 14 | source = new DataSource(arffInput); 15 | cpu = source.getDataSet(); 16 | } catch (Exception e1) { 17 | } 18 | } 19 | 20 | public void clusterData(){ 21 | kmeans = new SimpleKMeans(); 22 | kmeans.setSeed(10); 23 | try { 24 | kmeans.setPreserveInstancesOrder(true); 25 | kmeans.setNumClusters(10); 26 | kmeans.buildClusterer(cpu); 27 | int[] assignments = kmeans.getAssignments(); 28 | int i = 0; 
29 | for(int clusterNum : assignments) { 30 | System.out.printf("Instance %d -> Cluster %d\n", i, clusterNum); 31 | i++; 32 | } 33 | } catch (Exception e1) { 34 | } 35 | } 36 | 37 | public static void main(String[] args) throws Exception{ 38 | WekaClusterTest test = new WekaClusterTest(); 39 | test.loadArff("C:\\Program Files\\Weka-3-6\\data\\cpu.arff"); 40 | test.clusterData(); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /Chapter04/Code/WekaFeatureSelectionTest.java: -------------------------------------------------------------------------------- 1 | package chap4.java.science.data; 2 | 3 | import java.util.Random; 4 | 5 | import weka.attributeSelection.AttributeSelection; 6 | import weka.attributeSelection.BestFirst; 7 | import weka.attributeSelection.CfsSubsetEval; 8 | import weka.classifiers.Evaluation; 9 | import weka.classifiers.bayes.NaiveBayes; 10 | import weka.classifiers.meta.AttributeSelectedClassifier; 11 | import weka.core.Instances; 12 | import weka.core.Utils; 13 | import weka.core.converters.ConverterUtils.DataSource; 14 | import weka.filters.Filter; 15 | 16 | public class WekaFeatureSelectionTest { 17 | Instances iris = null; 18 | NaiveBayes nb; 19 | public void loadArff(String arffInput){ 20 | DataSource source = null; 21 | try { 22 | source = new DataSource(arffInput); 23 | iris = source.getDataSet(); 24 | iris.setClassIndex(iris.numAttributes() - 1); 25 | } catch (Exception e1) { 26 | } 27 | } 28 | 29 | public void selectFeatures(){ 30 | AttributeSelection attSelection = new AttributeSelection(); 31 | CfsSubsetEval eval = new CfsSubsetEval(); 32 | BestFirst search = new BestFirst(); 33 | attSelection.setEvaluator(eval); 34 | attSelection.setSearch(search); 35 | try { 36 | attSelection.SelectAttributes(iris); 37 | int[] attIndex = attSelection.selectedAttributes(); 38 | System.out.println(Utils.arrayToString(attIndex)); 39 | } catch (Exception e) { 40 | } 41 | } 42 | 43 | public void 
selectFeaturesWithFilter(){ 44 | weka.filters.supervised.attribute.AttributeSelection filter = new weka.filters.supervised.attribute.AttributeSelection(); 45 | CfsSubsetEval eval = new CfsSubsetEval(); 46 | BestFirst search = new BestFirst(); 47 | filter.setEvaluator(eval); 48 | filter.setSearch(search); 49 | try { 50 | filter.setInputFormat(iris); 51 | Instances newData = Filter.useFilter(iris, filter); 52 | System.out.println(newData); 53 | } catch (Exception e) { 54 | } 55 | } 56 | 57 | public void selectFeaturesWithClassifiers(){ 58 | AttributeSelectedClassifier classifier = new AttributeSelectedClassifier(); 59 | CfsSubsetEval eval = new CfsSubsetEval(); 60 | BestFirst search = new BestFirst(); 61 | nb = new NaiveBayes(); 62 | classifier.setClassifier(nb); 63 | classifier.setEvaluator(eval); 64 | classifier.setSearch(search); 65 | Evaluation evaluation; 66 | try { 67 | evaluation = new Evaluation(iris); 68 | evaluation.crossValidateModel(classifier, iris, 10, new Random(1)); 69 | System.out.println(evaluation.toSummaryString()); 70 | } catch (Exception e) { 71 | } 72 | } 73 | 74 | public static void main(String[] args){ 75 | WekaFeatureSelectionTest test = new WekaFeatureSelectionTest(); 76 | test.loadArff("C:/Program Files/Weka-3-6/data/iris.arff"); 77 | test.selectFeatures(); 78 | test.selectFeaturesWithFilter(); 79 | test.selectFeaturesWithClassifiers(); 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /Chapter04/Code/WekaFilteredClassifierTest.java: -------------------------------------------------------------------------------- 1 | package chap4.java.science.data; 2 | 3 | import weka.classifiers.meta.FilteredClassifier; 4 | import weka.classifiers.trees.RandomForest; 5 | import weka.core.Instances; 6 | import weka.core.converters.ConverterUtils.DataSource; 7 | import weka.filters.unsupervised.attribute.Remove; 8 | 9 | 10 | public class WekaFilteredClassifierTest { 11 | Instances weather = null; 12 | 
RandomForest rf; 13 | 14 | public void loadArff(String arffInput){ 15 | DataSource source = null; 16 | try { 17 | source = new DataSource(arffInput); 18 | weather = source.getDataSet(); 19 | weather.setClassIndex(weather.numAttributes() - 1); 20 | } catch (Exception e1) { 21 | } 22 | } 23 | 24 | public void buildFilteredClassifier(){ 25 | rf = new RandomForest(); 26 | Remove rm = new Remove(); 27 | rm.setAttributeIndices("1"); 28 | FilteredClassifier fc = new FilteredClassifier(); 29 | fc.setFilter(rm); 30 | fc.setClassifier(rf); 31 | try{ 32 | fc.buildClassifier(weather); 33 | for (int i = 0; i < weather.numInstances(); i++){ 34 | double pred = fc.classifyInstance(weather.instance(i)); 35 | System.out.print("given value: " + weather.classAttribute().value((int) weather.instance(i).classValue())); 36 | System.out.println("---predicted value: " + weather.classAttribute().value((int) pred)); 37 | } 38 | } catch (Exception e) { 39 | } 40 | } 41 | 42 | public static void main(String[] args){ 43 | WekaFilteredClassifierTest test = new WekaFilteredClassifierTest(); 44 | test.loadArff("C:/Program Files/Weka-3-6/data/weather.nominal.arff"); 45 | test.buildFilteredClassifier(); 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /Chapter04/Code/WekaLinearRegressionTest.java: -------------------------------------------------------------------------------- 1 | package chap4.java.science.data; 2 | 3 | import weka.classifiers.functions.LinearRegression; 4 | import weka.core.Instances; 5 | import weka.core.converters.ConverterUtils.DataSource; 6 | 7 | public class WekaLinearRegressionTest { 8 | Instances cpu = null; 9 | LinearRegression lReg ; 10 | 11 | public void loadArff(String arffInput){ 12 | DataSource source = null; 13 | try { 14 | source = new DataSource(arffInput); 15 | cpu = source.getDataSet(); 16 | cpu.setClassIndex(cpu.numAttributes() - 1); 17 | } catch (Exception e1) { 18 | } 19 | } 20 | 21 | public void 
buildRegression(){ 22 | lReg = new LinearRegression(); 23 | try { 24 | lReg.buildClassifier(cpu); 25 | } catch (Exception e) { 26 | } 27 | System.out.println(lReg); 28 | } 29 | 30 | public static void main(String[] args) throws Exception{ 31 | WekaLinearRegressionTest test = new WekaLinearRegressionTest(); 32 | test.loadArff("C:\\Program Files\\Weka-3-6\\data\\cpu.arff"); 33 | test.buildRegression(); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /Chapter04/Code/WekaLogisticRegressionTest.java: -------------------------------------------------------------------------------- 1 | package chap4.java.science.data; 2 | 3 | import weka.classifiers.functions.Logistic; 4 | import weka.core.Instances; 5 | import weka.core.converters.ConverterUtils.DataSource; 6 | 7 | public class WekaLogisticRegressionTest { 8 | Instances iris = null; 9 | Logistic logReg ; 10 | 11 | public void loadArff(String arffInput){ 12 | DataSource source = null; 13 | try { 14 | source = new DataSource(arffInput); 15 | iris = source.getDataSet(); 16 | iris.setClassIndex(iris.numAttributes() - 1); 17 | } catch (Exception e1) { 18 | } 19 | } 20 | 21 | public void buildRegression(){ 22 | logReg = new Logistic(); 23 | 24 | try { 25 | logReg.buildClassifier(iris); 26 | } catch (Exception e) { 27 | } 28 | System.out.println(logReg); 29 | } 30 | 31 | public static void main(String[] args) throws Exception{ 32 | WekaLogisticRegressionTest test = new WekaLogisticRegressionTest(); 33 | test.loadArff("C:\\Program Files\\Weka-3-6\\data\\iris.arff"); 34 | test.buildRegression(); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /Chapter04/Code/WekaTrainTest.java: -------------------------------------------------------------------------------- 1 | package chap4.java.science.data; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.BufferedWriter; 5 | import java.io.FileReader; 6 | import java.io.FileWriter; 
7 | import java.io.IOException; 8 | 9 | import weka.classifiers.bayes.NaiveBayes; 10 | import weka.core.Instances; 11 | 12 | public class WekaTrainTest { 13 | NaiveBayes nb; 14 | Instances train, test, labeled; 15 | 16 | public void loadModel(String modelPath){ 17 | try { 18 | nb = (NaiveBayes) weka.core.SerializationHelper.read(modelPath); 19 | } catch (Exception e) { 20 | } 21 | } 22 | 23 | public void loadDatasets(String training, String testing){ 24 | BufferedReader reader = null; 25 | try { 26 | reader = new BufferedReader(new FileReader(training)); 27 | train = new Instances (reader); 28 | train.setClassIndex(train.numAttributes() -1); 29 | } catch (IOException e) { 30 | } 31 | 32 | 33 | try { 34 | reader = new BufferedReader(new FileReader(testing)); 35 | test = new Instances (reader); 36 | test.setClassIndex(train.numAttributes() -1); 37 | } catch (IOException e) { 38 | } 39 | 40 | try { 41 | reader.close(); 42 | } catch (IOException e) { 43 | } 44 | } 45 | 46 | public void classify(){ 47 | try { 48 | nb.buildClassifier(train); 49 | } catch (Exception e) { 50 | } 51 | 52 | labeled = new Instances(test); 53 | 54 | for (int i = 0; i < test.numInstances(); i++) { 55 | double clsLabel; 56 | try { 57 | clsLabel = nb.classifyInstance(test.instance(i)); 58 | labeled.instance(i).setClassValue(clsLabel); 59 | double[] predictionOutput = nb.distributionForInstance(test.instance(i)); 60 | double predictionProbability = predictionOutput[1]; 61 | System.out.println(predictionProbability); 62 | } catch (Exception e) { 63 | } 64 | } 65 | } 66 | 67 | public void writeArff(String outArff){ 68 | BufferedWriter writer; 69 | try { 70 | writer = new BufferedWriter(new FileWriter(outArff)); 71 | writer.write(labeled.toString()); 72 | writer.close(); 73 | } catch (IOException e) { 74 | } 75 | } 76 | 77 | public static void main(String[] args) throws Exception{ 78 | WekaTrainTest test = new WekaTrainTest(); 79 | test.loadModel("c:/nb.model"); 80 | test.loadDatasets("C:\\Program 
Files\\Weka-3-8\\data\\iris.arff", "c:\\iris-test.arff"); 81 | test.classify(); 82 | test.writeArff("c:/out.arff"); 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /Chapter05/chapter-5/JavaMachineLearning.java: -------------------------------------------------------------------------------- 1 | package chap5.java.science.data; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.util.Map; 6 | 7 | import net.sf.javaml.classification.Classifier; 8 | import net.sf.javaml.classification.KNearestNeighbors; 9 | import net.sf.javaml.classification.evaluation.CrossValidation; 10 | import net.sf.javaml.classification.evaluation.EvaluateDataset; 11 | import net.sf.javaml.classification.evaluation.PerformanceMeasure; 12 | import net.sf.javaml.clustering.Clusterer; 13 | import net.sf.javaml.clustering.KMeans; 14 | import net.sf.javaml.clustering.evaluation.ClusterEvaluation; 15 | import net.sf.javaml.clustering.evaluation.SumOfSquaredErrors; 16 | import net.sf.javaml.core.Dataset; 17 | import net.sf.javaml.distance.PearsonCorrelationCoefficient; 18 | import net.sf.javaml.featureselection.ranking.RecursiveFeatureEliminationSVM; 19 | import net.sf.javaml.featureselection.scoring.GainRatio; 20 | import net.sf.javaml.featureselection.subset.GreedyForwardSelection; 21 | import net.sf.javaml.tools.data.FileHandler; 22 | 23 | public class JavaMachineLearning { 24 | public static void main(String[] args) throws IOException{ 25 | Dataset data = FileHandler.loadDataset(new File("datasets/UCI-small/iris/iris.data"), 4, ","); 26 | System.out.println(data); 27 | FileHandler.exportDataset(data, new File("c:/javaml-output.txt")); 28 | data = FileHandler.loadDataset(new File("c:/javaml-output.txt"), 0,"\t"); 29 | System.out.println(data); 30 | 31 | //Clustering 32 | Clusterer km = new KMeans(); 33 | Dataset[] clusters = km.cluster(data); 34 | for(Dataset cluster:clusters){ 35 | System.out.println("Cluster: " + 
cluster); 36 | } 37 | ClusterEvaluation sse= new SumOfSquaredErrors(); 38 | double score = sse.score(clusters); 39 | System.out.println(score); 40 | 41 | //Classification 42 | Classifier knn = new KNearestNeighbors(5); 43 | knn.buildClassifier(data); 44 | //Cross validation 45 | CrossValidation cv = new CrossValidation(knn); 46 | Map cvEvaluation = cv.crossValidation(data); 47 | System.out.println(cvEvaluation + "---------"); 48 | //Held-out testing 49 | Dataset testData = FileHandler.loadDataset(new File("datasets/UCI-small/iris/iris.data"), 4, ","); 50 | Map testEvaluation = 51 | EvaluateDataset.testDataset(knn, testData); 52 | for(Object classVariable:testEvaluation.keySet()){ 53 | System.out.println(classVariable + " class has "+testEvaluation.get(classVariable).getAccuracy()); 54 | } 55 | 56 | //Feature scoring 57 | GainRatio gainRatio = new GainRatio(); 58 | gainRatio.build(data); 59 | for (int i = 0; i < gainRatio.noAttributes(); i++){ 60 | System.out.println(gainRatio.score(i)); 61 | } 62 | 63 | //Feature ranking 64 | RecursiveFeatureEliminationSVM featureRank = new RecursiveFeatureEliminationSVM(0.2); 65 | featureRank.build(data); 66 | for (int i = 0; i < featureRank.noAttributes(); i++){ 67 | System.out.println(featureRank.rank(i)); 68 | } 69 | 70 | //Feature subset selection 71 | GreedyForwardSelection featureSelection = new GreedyForwardSelection(5, new PearsonCorrelationCoefficient()); 72 | featureSelection.build(data); 73 | System.out.println(featureSelection.selectedAttributes()); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /Chapter05/chapter-5/MOA.java: -------------------------------------------------------------------------------- 1 | package chap5.java.science.data; 2 | 3 | import moa.classifiers.trees.HoeffdingTree; 4 | import moa.classifiers.Classifier; 5 | import moa.core.TimingUtils; 6 | import moa.streams.generators.RandomRBFGenerator; 7 | import com.yahoo.labs.samoa.instances.Instance; 
8 | import java.io.IOException; 9 | 10 | 11 | public class MOA { 12 | 13 | public void run(int numInstances, boolean isTesting){ 14 | Classifier learner = new HoeffdingTree(); 15 | RandomRBFGenerator stream = new RandomRBFGenerator(); 16 | stream.prepareForUse(); 17 | 18 | learner.setModelContext(stream.getHeader()); 19 | learner.prepareForUse(); 20 | 21 | int numberSamplesCorrect = 0; 22 | int numberSamples = 0; 23 | long evaluateStartTime = TimingUtils.getNanoCPUTimeOfCurrentThread(); 24 | while (stream.hasMoreInstances() && numberSamples < numInstances) { 25 | Instance trainInst = stream.nextInstance().getData(); 26 | if (isTesting) { 27 | if (learner.correctlyClassifies(trainInst)){ 28 | numberSamplesCorrect++; 29 | } 30 | } 31 | numberSamples++; 32 | learner.trainOnInstance(trainInst); 33 | } 34 | double accuracy = 100.0 * (double) numberSamplesCorrect/ (double) numberSamples; 35 | double time = TimingUtils.nanoTimeToSeconds(TimingUtils.getNanoCPUTimeOfCurrentThread()- evaluateStartTime); 36 | System.out.println(numberSamples + " instances processed with " + accuracy + "% accuracy in "+time+" seconds."); 37 | } 38 | 39 | public static void main(String[] args) throws IOException { 40 | MOA exp = new MOA(); 41 | exp.run(1000000, true); 42 | } 43 | } -------------------------------------------------------------------------------- /Chapter05/chapter-5/Mulan.java: -------------------------------------------------------------------------------- 1 | import mulan.classifier.lazy.MLkNN; 2 | import mulan.classifier.meta.RAkEL; 3 | import mulan.classifier.transformation.LabelPowerset; 4 | import mulan.data.InvalidDataFormatException; 5 | import mulan.data.MultiLabelInstances; 6 | import mulan.evaluation.Evaluator; 7 | import mulan.evaluation.MultipleEvaluation; 8 | import weka.classifiers.trees.J48; 9 | 10 | public class Mulan { 11 | public static void main(String[] args){ 12 | MultiLabelInstances dataset = null; 13 | try { 14 | dataset = new 
MultiLabelInstances("F:\\mulan-1.5.0\\mulan\\data\\emotions.arff", "F:\\mulan-1.5.0\\mulan\\data\\emotions.xml"); 15 | } catch (InvalidDataFormatException e) { 16 | } 17 | RAkEL learner1 = new RAkEL(new LabelPowerset(new J48())); 18 | MLkNN learner2 = new MLkNN(); 19 | Evaluator eval = new Evaluator(); 20 | MultipleEvaluation results; 21 | int numFolds = 10; 22 | results = eval.crossValidate(learner1, dataset, numFolds); 23 | System.out.println(results); 24 | results = eval.crossValidate(learner2, dataset, numFolds); 25 | System.out.println(results); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /Chapter05/chapter-5/StanfordClassifier.java: -------------------------------------------------------------------------------- 1 | package chap5.java.science.data; 2 | 3 | import edu.stanford.nlp.classify.Classifier; 4 | import edu.stanford.nlp.classify.ColumnDataClassifier; 5 | import edu.stanford.nlp.ling.Datum; 6 | import edu.stanford.nlp.objectbank.ObjectBank; 7 | 8 | public class StanfordClassifier { 9 | public static void main(String[] args) throws Exception { 10 | ColumnDataClassifier columnDataClassifier = new ColumnDataClassifier("examples/cheese2007.prop"); 11 | Classifier classifier = 12 | columnDataClassifier.makeClassifier(columnDataClassifier.readTrainingExamples("examples/cheeseDisease.train")); 13 | for (String line : ObjectBank.getLineIterator("examples/cheeseDisease.test", "utf-8")) { 14 | Datum d = columnDataClassifier.makeDatumFromLine(line); 15 | System.out.println(line + " ==> " + classifier.classOf(d)); 16 | } 17 | } 18 | } 19 | 20 | -------------------------------------------------------------------------------- /Chapter06/CosineSimilarity.java: -------------------------------------------------------------------------------- 1 | package chap6.java.science.data; 2 | 3 | import java.util.HashSet; 4 | import java.util.Map; 5 | import java.util.Set; 6 | import java.util.stream.Collectors; 7 | import 
java.util.stream.Stream; 8 | 9 | public class CosineSimilarity { 10 | public double calculateCosine(String s1, String s2){ 11 | //tokenization in parallel with Java 8 12 | Stream stream1 = Stream.of(s1.toLowerCase().split("\\W+")).parallel(); 13 | Stream stream2 = Stream.of(s2.toLowerCase().split("\\W+")).parallel(); 14 | 15 | //word frequency maps for two strings 16 | Map wordFreq1 = stream1 17 | .collect(Collectors.groupingBy(String::toString,Collectors.counting())); 18 | Map wordFreq2 = stream2 19 | .collect(Collectors.groupingBy(String::toString,Collectors.counting())); 20 | 21 | //unique words for each string 22 | Set wordSet1 = wordFreq1.keySet(); 23 | Set wordSet2 = wordFreq2.keySet(); 24 | 25 | //common words of two strings 26 | Set intersection = new HashSet(wordSet1); 27 | intersection.retainAll(wordSet2); 28 | 29 | //numerator of cosine formula. s1.s2 30 | double numerator = 0; 31 | for (String common: intersection){ 32 | numerator += wordFreq1.get(common) * wordFreq2.get(common); 33 | } 34 | 35 | //denominator of cosine formula has two parameters 36 | double param1 = 0, param2 = 0; 37 | 38 | //sqrt (sum of squared of s1 word frequencies) 39 | for(String w1: wordSet1){ 40 | param1 += Math.pow(wordFreq1.get(w1), 2); 41 | } 42 | param1 = Math.sqrt(param1); 43 | 44 | //sqrt (sum of squared of s2 word frequencies) 45 | for(String w2: wordSet2){ 46 | param2 += Math.pow(wordFreq2.get(w2), 2); 47 | } 48 | param2 = Math.sqrt(param2); 49 | 50 | //denominator of cosine formula. 
sqrt(sum(s1^2)) X sqrt(sum(s2^2)) 51 | double denominator = param1 * param2; 52 | 53 | //cosine measure 54 | double cosineSimilarity = numerator/denominator; 55 | return cosineSimilarity; 56 | }//end method to calculate cosine similarity of two strings 57 | 58 | public static void main(String[] args){ 59 | CosineSimilarity cos = new CosineSimilarity(); 60 | System.out.println(cos.calculateCosine("To be, or not to be: that is the question.", "Frailty, thy name is woman!")); 61 | System.out.println(cos.calculateCosine("The lady doth protest too much, methinks.", "Frailty, thy name is woman!")); 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /Chapter06/Lemmatizer.java: -------------------------------------------------------------------------------- 1 | package chap6.java.science.data; 2 | 3 | import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation; 4 | import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; 5 | import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation; 6 | import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; 7 | import edu.stanford.nlp.ling.CoreLabel; 8 | import edu.stanford.nlp.pipeline.Annotation; 9 | import edu.stanford.nlp.pipeline.StanfordCoreNLP; 10 | import edu.stanford.nlp.util.CoreMap; 11 | import java.util.ArrayList; 12 | import java.util.List; 13 | import java.util.Properties; 14 | /** 15 | * Class to perform lemmatization using Stanford Core NLP 16 | * @author Themistoklis Mavridis 17 | */ 18 | public class Lemmatizer { 19 | 20 | 21 | public static void main(String[] args) 22 | { 23 | StanfordCoreNLP pipeline; 24 | Properties props = new Properties(); 25 | props.put("annotators", "tokenize, ssplit, pos, lemma"); 26 | pipeline = new StanfordCoreNLP(props, false); 27 | String text = "This is a test string"; 28 | Annotation document = pipeline.process(text); 29 | 30 | for(CoreMap sentence: document.get(SentencesAnnotation.class)) 31 | { 32 | for(CoreLabel token: 
sentence.get(TokensAnnotation.class)) 33 | { 34 | String word = token.get(TextAnnotation.class); 35 | String lemma = token.get(LemmaAnnotation.class); 36 | System.out.println("lemmatized version :" + lemma); 37 | } 38 | } 39 | } 40 | } -------------------------------------------------------------------------------- /Chapter06/OpenNlpSenToken.java: -------------------------------------------------------------------------------- 1 | package chap6.java.science.data; 2 | 3 | import java.io.FileInputStream; 4 | import java.io.IOException; 5 | import java.io.InputStream; 6 | 7 | import opennlp.tools.sentdetect.SentenceDetectorME; 8 | import opennlp.tools.sentdetect.SentenceModel; 9 | import opennlp.tools.tokenize.Tokenizer; 10 | import opennlp.tools.tokenize.TokenizerME; 11 | import opennlp.tools.tokenize.TokenizerModel; 12 | 13 | public class OpenNlpSenToken { 14 | public static void main(String[] args){ 15 | OpenNlpSenToken openNlp = new OpenNlpSenToken(); 16 | try { 17 | openNlp.useOpenNlp("My name is Rushdi Shams. " 18 | + "You can use Dr. before my name as I have a Ph.D. 
" 19 | + "but I am a bit shy to use it.", "opennlp-models/en-sent.bin", "sentence"); 20 | openNlp.useOpenNlp("\"Let's get this vis-a-vis\", he said, \"these boys' marks are really that well?\"", "opennlp-models/en-token.bin", "word"); 21 | } catch (IOException e) { 22 | } 23 | } 24 | public void useOpenNlp(String sourceText, String modelPath, String choice) throws IOException{ 25 | InputStream modelIn = null; 26 | modelIn = new FileInputStream(modelPath); 27 | 28 | if(choice.equalsIgnoreCase("sentence")){ 29 | SentenceModel model = new SentenceModel(modelIn); 30 | modelIn.close(); 31 | SentenceDetectorME sentenceDetector = new SentenceDetectorME(model); 32 | String sentences[] = sentenceDetector.sentDetect(sourceText); 33 | System.out.println("Sentences: "); 34 | for(String sentence:sentences){ 35 | System.out.println(sentence); 36 | } 37 | } 38 | else if(choice.equalsIgnoreCase("word")){ 39 | TokenizerModel model = new TokenizerModel(modelIn); 40 | modelIn.close(); 41 | Tokenizer tokenizer = new TokenizerME(model); 42 | String tokens[] = tokenizer.tokenize(sourceText); 43 | System.out.println("Words: "); 44 | for(String token:tokens){ 45 | System.out.println(token); 46 | } 47 | } 48 | else{ 49 | System.out.println("Error in choice"); 50 | modelIn.close(); 51 | return; 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /Chapter06/SentenceDetection.java: -------------------------------------------------------------------------------- 1 | package chap6.java.science.data; 2 | 3 | import java.text.BreakIterator; 4 | import java.util.Locale; 5 | 6 | public class SentenceDetection { 7 | public void useSentenceIterator(String source){ 8 | BreakIterator iterator = BreakIterator.getSentenceInstance(Locale.US); 9 | iterator.setText(source); 10 | int start = iterator.first(); 11 | for (int end = iterator.next(); 12 | end != BreakIterator.DONE; 13 | start = end, end = iterator.next()) { 14 | 
System.out.println(source.substring(start,end)); 15 | } 16 | } 17 | public static void main(String[] args){ 18 | SentenceDetection detection = new SentenceDetection(); 19 | String test = "My name is Rushdi Shams. You can use Dr. before my name as I have a Ph.D. but I am a bit shy to use it."; 20 | detection.useSentenceIterator(test); 21 | } 22 | } 23 | 24 | -------------------------------------------------------------------------------- /Chapter06/WekaClassification.java: -------------------------------------------------------------------------------- 1 | package chap6.java.science.data; 2 | 3 | import weka.core.*; 4 | import weka.core.converters.*; 5 | import weka.classifiers.Evaluation; 6 | import weka.classifiers.bayes.NaiveBayes; 7 | import weka.filters.*; 8 | import weka.filters.unsupervised.attribute.*; 9 | 10 | import java.io.*; 11 | import java.util.Random; 12 | 13 | public class WekaClassification { 14 | public static void main(String[] args) throws Exception { 15 | TextDirectoryLoader loader = new TextDirectoryLoader(); 16 | loader.setDirectory(new File("C:/text_example")); 17 | Instances data = loader.getDataSet(); 18 | 19 | StringToWordVector filter = new StringToWordVector(); 20 | filter.setInputFormat(data); 21 | Instances dataFiltered = Filter.useFilter(data, filter); 22 | 23 | NaiveBayes nb = new NaiveBayes(); 24 | nb.buildClassifier(dataFiltered); 25 | System.out.println("\n\nClassifier model:\n\n" + nb); 26 | 27 | Evaluation eval = null; 28 | eval = new Evaluation(dataFiltered); 29 | eval.crossValidateModel(nb, dataFiltered, 5, new Random(1)); 30 | System.out.println(eval.toSummaryString()); 31 | } 32 | } -------------------------------------------------------------------------------- /Chapter06/WordDetection.java: -------------------------------------------------------------------------------- 1 | package chap6.java.science.data; 2 | 3 | import java.text.BreakIterator; 4 | import java.util.StringTokenizer; 5 | import java.util.regex.Matcher; 6 | 
import java.util.regex.Pattern; 7 | 8 | public class WordDetection { 9 | public static void main(String[] args){ 10 | String input = "\"Let's get this vis-a-vis\", he said, \"these boys' marks are really that well?\""; 11 | WordDetection wordDetection = new WordDetection(); 12 | wordDetection.useTokenizer(input); 13 | wordDetection.useBreakIterator(input); 14 | wordDetection.useRegEx(input); 15 | 16 | } 17 | 18 | public void useTokenizer(String input){ 19 | System.out.println("Tokenizer"); 20 | StringTokenizer tokenizer = new StringTokenizer(input); 21 | String word =""; 22 | while(tokenizer.hasMoreTokens()){ 23 | word = tokenizer.nextToken(); 24 | System.out.println(word); 25 | } 26 | } 27 | 28 | public void useBreakIterator(String input){ 29 | System.out.println("Break Iterator"); 30 | BreakIterator tokenizer = BreakIterator.getWordInstance(); 31 | tokenizer.setText(input); 32 | int start = tokenizer.first(); 33 | for (int end = tokenizer.next(); 34 | end != BreakIterator.DONE; 35 | start = end, end = tokenizer.next()) { 36 | System.out.println(input.substring(start,end)); 37 | } 38 | } 39 | 40 | public void useRegEx(String input){ 41 | System.out.println("Regular Expression"); 42 | Pattern pattern = Pattern.compile("\\w[\\w-]+('\\w*)?"); 43 | Matcher matcher = pattern.matcher(input); 44 | 45 | while ( matcher.find() ) { 46 | System.out.println(input.substring(matcher.start(), matcher.end())); 47 | } 48 | } 49 | } 50 | 51 | -------------------------------------------------------------------------------- /Chapter07/Code/KMeansClusteringMlib.java: -------------------------------------------------------------------------------- 1 | package com.data.big.mlib; 2 | 3 | import org.apache.spark.api.java.*; 4 | import org.apache.spark.api.java.function.Function; 5 | import org.apache.spark.mllib.clustering.KMeans; 6 | import org.apache.spark.mllib.clustering.KMeansModel; 7 | import org.apache.spark.mllib.linalg.Vector; 8 | import org.apache.spark.mllib.linalg.Vectors; 9 | 
import org.apache.spark.SparkConf; 10 | 11 | public class KMeansClusteringMlib { 12 | public static void main( String[] args ){ 13 | SparkConf conf = new SparkConf().setMaster("local[4]").setAppName("K-means Example"); 14 | JavaSparkContext sc = new JavaSparkContext(conf); 15 | 16 | // Load and parse data 17 | String path = "data/km-data.txt"; 18 | JavaRDD data = sc.textFile(path); 19 | JavaRDD parsedData = data.map( 20 | new Function() { 21 | public Vector call(String s) { 22 | String[] sarray = s.split(" "); 23 | double[] values = new double[sarray.length]; 24 | for (int i = 0; i < sarray.length; i++) 25 | values[i] = Double.parseDouble(sarray[i]); 26 | return Vectors.dense(values); 27 | } 28 | } 29 | ); 30 | parsedData.cache(); 31 | 32 | // Cluster the data into two classes using KMeans 33 | int numClusters = 2; 34 | int numIterations = 20; 35 | KMeansModel clusters = KMeans.train(parsedData.rdd(), numClusters, numIterations); 36 | 37 | // Evaluate clustering by computing Within Set Sum of Squared Errors 38 | double WSSSE = clusters.computeCost(parsedData.rdd()); 39 | System.out.println("Within Set Sum of Squared Errors = " + WSSSE); 40 | 41 | 42 | 43 | } 44 | } -------------------------------------------------------------------------------- /Chapter07/Code/LinearRegressionMlib.java: -------------------------------------------------------------------------------- 1 | package com.data.big.mlib; 2 | 3 | import scala.Tuple2; 4 | 5 | import org.apache.spark.api.java.*; 6 | import org.apache.spark.api.java.function.Function; 7 | import org.apache.spark.mllib.linalg.Vectors; 8 | import org.apache.spark.mllib.regression.LabeledPoint; 9 | import org.apache.spark.mllib.regression.LinearRegressionModel; 10 | import org.apache.spark.mllib.regression.LinearRegressionWithSGD; 11 | import org.apache.spark.SparkConf; 12 | 13 | public class LinearRegressionMlib { 14 | 15 | public static void main(String[] args) { 16 | SparkConf configuration = new 
SparkConf().setMaster("local[4]").setAppName("Linear Regression Example"); 17 | JavaSparkContext sparkContext = new JavaSparkContext(configuration); 18 | 19 | // Load and parse the data 20 | String inputData = "data/lr-data.txt"; 21 | JavaRDD data = sparkContext.textFile(inputData); 22 | JavaRDD parsedData = data.map( 23 | new Function() { 24 | public LabeledPoint call(String line) { 25 | String[] parts = line.split(","); 26 | String[] features = parts[1].split(" "); 27 | double[] featureVector = new double[features.length]; 28 | for (int i = 0; i < features.length - 1; i++){ 29 | featureVector[i] = Double.parseDouble(features[i]); 30 | } 31 | return new LabeledPoint(Double.parseDouble(parts[0]), Vectors.dense(featureVector)); 32 | } 33 | } 34 | ); 35 | parsedData.cache(); 36 | 37 | // Building the model 38 | int numIterations = 100; 39 | final LinearRegressionModel model = 40 | LinearRegressionWithSGD.train(JavaRDD.toRDD(parsedData), numIterations); 41 | 42 | // Evaluate model on training examples and compute training error 43 | JavaRDD> valuesAndPreds = parsedData.map( 44 | new Function>() { 45 | public Tuple2 call(LabeledPoint point) { 46 | double prediction = model.predict(point.features()); 47 | return new Tuple2(prediction, point.label()); 48 | } 49 | } 50 | ); 51 | double MSE = new JavaDoubleRDD(valuesAndPreds.map( 52 | new Function, Object>() { 53 | public Object call(Tuple2 pair) { 54 | return Math.pow(pair._1() - pair._2(), 2.0); 55 | } 56 | } 57 | ).rdd()).mean(); 58 | System.out.println("training Mean Squared Error = " + MSE); 59 | } 60 | } -------------------------------------------------------------------------------- /Chapter07/Code/OnlineLogisticRegressionTest.java: -------------------------------------------------------------------------------- 1 | package chapter4.src.logistic; 2 | 3 | import com.google.common.base.Charsets; 4 | import com.google.common.io.Resources; 5 | 6 | import org.apache.mahout.math.Matrix; 7 | import 
org.apache.mahout.math.SequentialAccessSparseVector; 8 | import org.apache.mahout.math.Vector; 9 | import org.apache.mahout.classifier.evaluation.Auc; 10 | import org.apache.mahout.classifier.sgd.CsvRecordFactory; 11 | import org.apache.mahout.classifier.sgd.LogisticModelParameters; 12 | import org.apache.mahout.classifier.sgd.OnlineLogisticRegression; 13 | 14 | import java.io.BufferedReader; 15 | import java.io.File; 16 | import java.io.FileInputStream; 17 | import java.io.IOException; 18 | import java.io.InputStream; 19 | import java.io.InputStreamReader; 20 | import java.io.OutputStreamWriter; 21 | import java.io.PrintWriter; 22 | import java.util.Locale; 23 | 24 | public class OnlineLogisticRegressionTest { 25 | 26 | private static String inputFile="data/weather.numeric.test.csv"; 27 | private static String modelFile="model/model"; 28 | private static boolean showAuc; 29 | private static boolean showScores; 30 | private static boolean showConfusion; 31 | static BufferedReader open(String inputFile) throws IOException { 32 | InputStream in; 33 | try { 34 | in = Resources.getResource(inputFile).openStream(); 35 | } catch (IllegalArgumentException e) { 36 | in = new FileInputStream(new File(inputFile)); 37 | } 38 | return new BufferedReader(new InputStreamReader(in, Charsets.UTF_8)); 39 | } 40 | 41 | public static void main(String[] args) throws Exception { 42 | showAuc = true; 43 | showConfusion = true; 44 | Auc collector = new Auc(); 45 | LogisticModelParameters lmp = LogisticModelParameters.loadFrom(new File(modelFile)); 46 | CsvRecordFactory csv = lmp.getCsvRecordFactory(); 47 | OnlineLogisticRegression lr = lmp.createRegression(); 48 | BufferedReader in = OnlineLogisticRegressionTest.open(inputFile); 49 | String line = in.readLine(); 50 | csv.firstLine(line); 51 | line = in.readLine(); 52 | PrintWriter output=new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true); 53 | output.println("\"target\",\"model-output\",\"log-likelihood\""); 54 | 
while (line != null) { 55 | System.out.println("-----" + line); 56 | Vector v = new SequentialAccessSparseVector(lmp.getNumFeatures()); 57 | int target = csv.processLine(line, v); 58 | double score = lr.classifyScalarNoLink(v); 59 | output.printf(Locale.ENGLISH, "%d,%.3f,%.6f%n", target, score, lr.logLikelihood(target, v)); 60 | collector.add(target, score); 61 | line = in.readLine(); 62 | System.out.println("I am here"); 63 | } 64 | output.printf(Locale.ENGLISH, "AUC = %.2f%n", collector.auc()); 65 | Matrix m = collector.confusion(); 66 | output.printf(Locale.ENGLISH, "confusion: [[%.1f, %.1f], [%.1f, %.1f]]%n", 67 | m.get(0, 0), m.get(1, 0), m.get(0, 1), m.get(1, 1)); 68 | m = collector.entropy(); 69 | output.printf(Locale.ENGLISH, "entropy: [[%.1f, %.1f], [%.1f, %.1f]]%n", 70 | m.get(0, 0), m.get(1, 0), m.get(0, 1), m.get(1, 1)); 71 | } 72 | 73 | } -------------------------------------------------------------------------------- /Chapter07/Code/OnlineLogisticRegressionTrain.java: -------------------------------------------------------------------------------- 1 | package chapter4.src.logistic; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.FileOutputStream; 5 | import java.io.FileReader; 6 | import java.io.IOException; 7 | import java.io.OutputStream; 8 | import java.io.OutputStreamWriter; 9 | import java.io.PrintWriter; 10 | import java.util.Arrays; 11 | import java.util.List; 12 | import java.util.Locale; 13 | 14 | import org.apache.mahout.classifier.sgd.CsvRecordFactory; 15 | import org.apache.mahout.classifier.sgd.LogisticModelParameters; 16 | import org.apache.mahout.classifier.sgd.OnlineLogisticRegression; 17 | import org.apache.mahout.classifier.sgd.RecordFactory; 18 | import org.apache.mahout.math.RandomAccessSparseVector; 19 | import org.apache.mahout.math.Vector; 20 | 21 | import com.google.common.base.Charsets; 22 | 23 | 24 | public class OnlineLogisticRegressionTrain { 25 | private static double predictorWeight(OnlineLogisticRegression lr, 
int row, RecordFactory csv, String predictor) { 26 | double weight = 0; 27 | for (Integer column : csv.getTraceDictionary().get(predictor)) { 28 | weight += lr.getBeta().get(row, column); 29 | } 30 | return weight; 31 | } 32 | public static void main(String[] args) throws IOException 33 | { 34 | String inputFile = "data/weather.numeric.csv"; 35 | String outputFile = "model/model"; 36 | 37 | /* List predictorList =Arrays.asList("age","job","marital","education","default", 38 | "housing","loan","contact","month","day_of_week","duration","campaign","pdays","previous","poutcome", 39 | "emp.var.rate","cons.price.idx","cons.conf.idx","euribor3m","nr.employed"); 40 | List typeList = Arrays.asList("n", "w", "w", "w", "w", "w", "w", "w", "w", "w", "n", "n", "n", "n", 41 | "w", "n", "n", "n", "n", "n");*/ 42 | 43 | /*List predictorList =Arrays.asList("sepallength", "sepalwidth", "petallength", "petalwidth", "class"); 44 | List typeList = Arrays.asList("n", "n", "n", "n", "w");*/ 45 | List predictorList =Arrays.asList("outlook", "temperature", "humidity", "windy", "play"); 46 | List typeList = Arrays.asList("w", "n", "n", "w", "w"); 47 | LogisticModelParameters lmp = new LogisticModelParameters(); 48 | lmp.setTargetVariable("play"); 49 | lmp.setMaxTargetCategories(2); 50 | lmp.setNumFeatures(4); 51 | lmp.setUseBias(false); 52 | lmp.setTypeMap(predictorList,typeList); 53 | lmp.setLearningRate(0.5); 54 | 55 | 56 | int passes = 50; 57 | OnlineLogisticRegression lr; 58 | 59 | CsvRecordFactory csv = lmp.getCsvRecordFactory(); 60 | lr = lmp.createRegression(); 61 | 62 | 63 | int k = 0; 64 | 65 | for (int pass = 0; pass < passes; pass++) { 66 | BufferedReader in = new BufferedReader(new FileReader(inputFile)); 67 | 68 | csv.firstLine(in.readLine()); 69 | 70 | String line = in.readLine(); 71 | System.out.println(line); 72 | int lineCount = 2; 73 | while (line != null) { 74 | System.out.println("line " + lineCount); 75 | System.out.println(lmp.getNumFeatures()); 76 | Vector input = 
new RandomAccessSparseVector(lmp.getNumFeatures()); 77 | int targetValue = csv.processLine(line, input); 78 | 79 | // update model 80 | lr.train(targetValue, input); 81 | k++; 82 | 83 | line = in.readLine(); 84 | lineCount++; 85 | } 86 | in.close(); 87 | } 88 | 89 | OutputStream modelOutput = new FileOutputStream(outputFile); 90 | try { 91 | lmp.saveTo(modelOutput); 92 | } finally { 93 | modelOutput.close(); 94 | } 95 | PrintWriter output=new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true); 96 | output.println(lmp.getNumFeatures()); 97 | output.println(lmp.getTargetVariable() + " ~ "); 98 | String sep = ""; 99 | for (String v : csv.getTraceDictionary().keySet()) { 100 | double weight = predictorWeight(lr, 0, csv, v); 101 | if (weight != 0) { 102 | output.printf(Locale.ENGLISH, "%s%.3f*%s", sep, weight, v); 103 | sep = " + "; 104 | } 105 | } 106 | output.printf("%n"); 107 | for (int row = 0; row < lr.getBeta().numRows(); row++) { 108 | for (String key : csv.getTraceDictionary().keySet()) { 109 | double weight = predictorWeight(lr, row, csv, key); 110 | if (weight != 0) { 111 | output.printf(Locale.ENGLISH, "%20s %.5f%n", key, weight); 112 | } 113 | } 114 | for (int column = 0; column < lr.getBeta().numCols(); column++) { 115 | output.printf(Locale.ENGLISH, "%15.9f ", lr.getBeta().get(row, column)); 116 | } 117 | output.println(); 118 | } 119 | } 120 | } -------------------------------------------------------------------------------- /Chapter07/Code/RandomForestMlib.java: -------------------------------------------------------------------------------- 1 | package com.data.big.mlib; 2 | 3 | import scala.Tuple2; 4 | import java.util.HashMap; 5 | import org.apache.spark.SparkConf; 6 | import org.apache.spark.api.java.JavaPairRDD; 7 | import org.apache.spark.api.java.JavaRDD; 8 | import org.apache.spark.api.java.JavaSparkContext; 9 | import org.apache.spark.api.java.function.Function; 10 | import org.apache.spark.api.java.function.PairFunction; 11 | 
import org.apache.spark.mllib.regression.LabeledPoint; 12 | import org.apache.spark.mllib.tree.RandomForest; 13 | import org.apache.spark.mllib.tree.model.RandomForestModel; 14 | import org.apache.spark.mllib.util.MLUtils; 15 | 16 | public class RandomForestMlib { 17 | public static void main(String args[]){ 18 | 19 | SparkConf configuration = new SparkConf().setMaster("local[4]").setAppName("Any"); 20 | JavaSparkContext sc = new JavaSparkContext(configuration); 21 | 22 | // Load and parse the data file. 23 | String input = "data/rf-data.txt"; 24 | JavaRDD data = MLUtils.loadLibSVMFile(sc.sc(), input).toJavaRDD(); 25 | // Split the data into training and test sets (30% held out for testing) 26 | JavaRDD[] dataSplits = data.randomSplit(new double[]{0.7, 0.3}); 27 | JavaRDD trainingData = dataSplits[0]; 28 | JavaRDD testData = dataSplits[1]; 29 | 30 | // Train a RandomForest model. 31 | Integer numClasses = 2; 32 | HashMap categoricalFeaturesInfo = new HashMap();// Empty categoricalFeaturesInfo indicates all features are continuous. 33 | Integer numTrees = 3; // Use more in practice. 34 | String featureSubsetStrategy = "auto"; // Let the algorithm choose. 
35 | String impurity = "gini"; 36 | Integer maxDepth = 5; 37 | Integer maxBins = 32; 38 | Integer seed = 12345; 39 | 40 | final RandomForestModel rfModel = RandomForest.trainClassifier(trainingData, numClasses, 41 | categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins, 42 | seed); 43 | 44 | // Evaluate model on test instances and compute test error 45 | JavaPairRDD label = 46 | testData.mapToPair(new PairFunction() { 47 | public Tuple2 call(LabeledPoint p) { 48 | return new Tuple2(rfModel.predict(p.features()), p.label()); 49 | } 50 | }); 51 | 52 | Double testError = 53 | 1.0 * label.filter(new Function, Boolean>() { 54 | public Boolean call(Tuple2 pl) { 55 | return !pl._1().equals(pl._2()); 56 | } 57 | }).count() / testData.count(); 58 | 59 | System.out.println("Test Error: " + testError); 60 | System.out.println("Learned classification forest model:\n" + rfModel.toDebugString()); 61 | } 62 | } -------------------------------------------------------------------------------- /Chapter07/Code/ScalaTest.java: -------------------------------------------------------------------------------- 1 | package com.data.big.mlib; 2 | 3 | import org.apache.spark.SparkConf; 4 | import org.apache.spark.api.java.JavaRDD; 5 | import org.apache.spark.api.java.JavaSparkContext; 6 | import org.apache.spark.api.java.function.Function; 7 | 8 | 9 | public class ScalaTest { 10 | public static void main( String[] args ){ 11 | String inputFile = "data/dummy.txt"; 12 | SparkConf configuration = new SparkConf().setMaster("local[4]").setAppName("My App"); 13 | JavaSparkContext sparkContext = new JavaSparkContext(configuration); 14 | JavaRDD logData = sparkContext.textFile(inputFile).cache(); 15 | 16 | long numberA = logData.filter(new Function(){ 17 | private static final long serialVersionUID = 1L; 18 | public Boolean call(String s){ 19 | return s.length() == 0; 20 | } 21 | }).count(); 22 | sparkContext.close(); 23 | System.out.println("Empty Lines: " + 
numberA); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /Chapter08/Chap-08-Code.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Java-Data-Science-Cookbook/b2bf9ef367bf6c04a96e24123e4160b733b7fed9/Chapter08/Chap-08-Code.rar -------------------------------------------------------------------------------- /Chapter08/Chap-08-Code/Code/DBNIrisExample.java: -------------------------------------------------------------------------------- 1 | package deepbelief.chap8.science.data; 2 | 3 | 4 | import org.deeplearning4j.datasets.iterator.DataSetIterator; 5 | import org.deeplearning4j.datasets.iterator.impl.IrisDataSetIterator; 6 | import org.deeplearning4j.eval.Evaluation; 7 | import org.deeplearning4j.nn.api.OptimizationAlgorithm; 8 | import org.deeplearning4j.nn.conf.MultiLayerConfiguration; 9 | import org.deeplearning4j.nn.conf.NeuralNetConfiguration; 10 | import org.deeplearning4j.nn.conf.Updater; 11 | import org.deeplearning4j.nn.conf.layers.OutputLayer; 12 | import org.deeplearning4j.nn.conf.layers.RBM; 13 | import org.deeplearning4j.nn.multilayer.MultiLayerNetwork; 14 | import org.deeplearning4j.nn.params.DefaultParamInitializer; 15 | import org.deeplearning4j.nn.weights.WeightInit; 16 | import org.deeplearning4j.optimize.api.IterationListener; 17 | import org.deeplearning4j.optimize.listeners.ScoreIterationListener; 18 | import org.nd4j.linalg.api.ndarray.INDArray; 19 | import org.nd4j.linalg.dataset.DataSet; 20 | import org.nd4j.linalg.dataset.SplitTestAndTrain; 21 | import org.nd4j.linalg.factory.Nd4j; 22 | import org.nd4j.linalg.lossfunctions.LossFunctions; 23 | import org.slf4j.Logger; 24 | import org.slf4j.LoggerFactory; 25 | 26 | 27 | import java.util.Arrays; 28 | import java.util.Random; 29 | 30 | public class DBNIrisExample { 31 | 32 | private static Logger log = LoggerFactory.getLogger(DBNIrisExample.class); 33 | 34 | 
public static void main(String[] args) throws Exception { 35 | // Customizing params 36 | Nd4j.MAX_SLICES_TO_PRINT = -1; 37 | Nd4j.MAX_ELEMENTS_PER_SLICE = -1; 38 | 39 | final int numRows = 4; 40 | final int numColumns = 1; 41 | int outputNum = 3; 42 | int numSamples = 150; 43 | int batchSize = 150; 44 | int iterations = 5; 45 | int splitTrainNum = (int) (batchSize * .8); 46 | int seed = 123; 47 | int listenerFreq = 1; 48 | 49 | log.info("Load data...."); 50 | DataSetIterator iter = new IrisDataSetIterator(batchSize, numSamples); 51 | DataSet next = iter.next(); 52 | next.normalizeZeroMeanZeroUnitVariance(); 53 | 54 | log.info("Split data...."); 55 | SplitTestAndTrain testAndTrain = next.splitTestAndTrain(splitTrainNum, new Random(seed)); 56 | DataSet train = testAndTrain.getTrain(); 57 | DataSet test = testAndTrain.getTest(); 58 | Nd4j.ENFORCE_NUMERICAL_STABILITY = true; 59 | 60 | log.info("Build model...."); 61 | MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder() 62 | .seed(seed) // Locks in weight initialization for tuning 63 | .iterations(iterations) // # training iterations predict/classify & backprop 64 | .learningRate(1e-6f) // Optimization step size 65 | .optimizationAlgo(OptimizationAlgorithm.CONJUGATE_GRADIENT) // Backprop to calculate gradients 66 | .l1(1e-1).regularization(true).l2(2e-4) 67 | .useDropConnect(true) 68 | .list(2) // # NN layers (doesn't count input layer) 69 | .layer(0, new RBM.Builder(RBM.HiddenUnit.RECTIFIED, RBM.VisibleUnit.GAUSSIAN) 70 | .nIn(numRows * numColumns) // # input nodes 71 | .nOut(3) // # fully connected hidden layer nodes. Add list if multiple layers. 
72 | .weightInit(WeightInit.XAVIER) // Weight initialization 73 | .k(1) // # contrastive divergence iterations 74 | .activation("relu") // Activation function type 75 | .lossFunction(LossFunctions.LossFunction.RMSE_XENT) // Loss function type 76 | .updater(Updater.ADAGRAD) 77 | .dropOut(0.5) 78 | .build() 79 | ) // NN layer type 80 | .layer(1, new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT) 81 | .nIn(3) // # input nodes 82 | .nOut(outputNum) // # output nodes 83 | .activation("softmax") 84 | .build() 85 | ) // NN layer type 86 | .build(); 87 | MultiLayerNetwork model = new MultiLayerNetwork(conf); 88 | model.init(); 89 | // model.setListeners(Arrays.asList(new ScoreIterationListener(listenerFreq), 90 | // new GradientPlotterIterationListener(listenerFreq), 91 | // new LossPlotterIterationListener(listenerFreq))); 92 | 93 | 94 | model.setListeners(Arrays.asList((IterationListener) new ScoreIterationListener(listenerFreq))); 95 | log.info("Train model...."); 96 | model.fit(train); 97 | 98 | log.info("Evaluate weights...."); 99 | for(org.deeplearning4j.nn.api.Layer layer : model.getLayers()) { 100 | INDArray w = layer.getParam(DefaultParamInitializer.WEIGHT_KEY); 101 | log.info("Weights: " + w); 102 | } 103 | 104 | log.info("Evaluate model...."); 105 | Evaluation eval = new Evaluation(outputNum); 106 | INDArray output = model.output(test.getFeatureMatrix()); 107 | 108 | for (int i = 0; i < output.rows(); i++) { 109 | String actual = test.getLabels().getRow(i).toString().trim(); 110 | String predicted = output.getRow(i).toString().trim(); 111 | log.info("actual " + actual + " vs predicted " + predicted); 112 | } 113 | 114 | eval.eval(test.getLabels(), output); 115 | log.info(eval.stats()); 116 | log.info("****************Example finished********************"); 117 | 118 | 119 | /* OutputStream fos = Files.newOutputStream(Paths.get("coefficients.bin")); 120 | DataOutputStream dos = new DataOutputStream(fos); 121 | Nd4j.write(model.params(), dos); 122 | 
package deepbelief.chap8.science.data;

import org.deeplearning4j.datasets.fetchers.MnistDataFetcher;
import org.deeplearning4j.datasets.iterator.impl.MnistDataSetIterator;
import org.deeplearning4j.nn.api.OptimizationAlgorithm;
import org.deeplearning4j.nn.conf.MultiLayerConfiguration;
import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
import org.deeplearning4j.nn.conf.layers.OutputLayer;
import org.deeplearning4j.nn.conf.layers.RBM;
import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
import org.deeplearning4j.optimize.api.IterationListener;
import org.deeplearning4j.optimize.listeners.ScoreIterationListener;
import org.nd4j.linalg.dataset.DataSet;
import org.nd4j.linalg.dataset.api.iterator.DataSetIterator;
import org.nd4j.linalg.lossfunctions.LossFunctions;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Arrays;

/**
 * Trains a deep autoencoder on the MNIST data set: a stack of RBM layers
 * that compresses each 28x28 image (784 inputs) down to a 30-unit code
 * (1000 -> 500 -> 250 -> 100 -> 30) and then mirrors the stack back up to
 * reconstruct the original pixels. Training targets are the inputs
 * themselves (unsupervised reconstruction).
 *
 * @author Adam Gibson
 */
public class DeepAutoEncoderExample {

    private static final Logger log = LoggerFactory.getLogger(DeepAutoEncoderExample.class);

    public static void main(String[] args) throws Exception {
        final int numRows = 28;     // MNIST image height
        final int numColumns = 28;  // MNIST image width
        int seed = 123;             // RNG seed for reproducible weight initialization
        int numSamples = MnistDataFetcher.NUM_EXAMPLES;
        int batchSize = 1000;
        int iterations = 1;
        // BUG FIX: the original computed iterations/5, which is 0 whenever
        // iterations < 5; ScoreIterationListener(0) then divides by zero (or
        // never reports, depending on the DL4J version). Clamp to at least 1.
        int listenerFreq = Math.max(1, iterations / 5);

        log.info("Load data....");
        // 'true' binarizes the pixel values.
        DataSetIterator iter = new MnistDataSetIterator(batchSize, numSamples, true);

        log.info("Build model....");
        MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder()
                .seed(seed)
                .iterations(iterations)
                .optimizationAlgo(OptimizationAlgorithm.LINE_GRADIENT_DESCENT)
                .list(10)
                .layer(0, new RBM.Builder().nIn(numRows * numColumns).nOut(1000).lossFunction(LossFunctions.LossFunction.RMSE_XENT).build())
                .layer(1, new RBM.Builder().nIn(1000).nOut(500).lossFunction(LossFunctions.LossFunction.RMSE_XENT).build())
                .layer(2, new RBM.Builder().nIn(500).nOut(250).lossFunction(LossFunctions.LossFunction.RMSE_XENT).build())
                .layer(3, new RBM.Builder().nIn(250).nOut(100).lossFunction(LossFunctions.LossFunction.RMSE_XENT).build())
                .layer(4, new RBM.Builder().nIn(100).nOut(30).lossFunction(LossFunctions.LossFunction.RMSE_XENT).build()) // encoding stops
                .layer(5, new RBM.Builder().nIn(30).nOut(100).lossFunction(LossFunctions.LossFunction.RMSE_XENT).build()) // decoding starts
                .layer(6, new RBM.Builder().nIn(100).nOut(250).lossFunction(LossFunctions.LossFunction.RMSE_XENT).build())
                .layer(7, new RBM.Builder().nIn(250).nOut(500).lossFunction(LossFunctions.LossFunction.RMSE_XENT).build())
                .layer(8, new RBM.Builder().nIn(500).nOut(1000).lossFunction(LossFunctions.LossFunction.RMSE_XENT).build())
                .layer(9, new OutputLayer.Builder(LossFunctions.LossFunction.RMSE_XENT).nIn(1000).nOut(numRows * numColumns).build())
                .pretrain(true).backprop(true)
                .build();

        MultiLayerNetwork model = new MultiLayerNetwork(conf);
        model.init();

        model.setListeners(Arrays.asList((IterationListener) new ScoreIterationListener(listenerFreq)));

        log.info("Train model....");
        while (iter.hasNext()) {
            DataSet next = iter.next();
            // Autoencoder training: the features serve as both input and target.
            model.fit(new DataSet(next.getFeatureMatrix(), next.getFeatureMatrix()));
        }
    }
}
package word2vec.chap8.science.data;

import org.deeplearning4j.models.embeddings.WeightLookupTable;
import org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable;
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer;
import org.deeplearning4j.models.word2vec.Word2Vec;
import org.deeplearning4j.models.word2vec.wordstore.inmemory.InMemoryLookupCache;
import org.deeplearning4j.text.sentenceiterator.SentenceIterator;
import org.deeplearning4j.text.sentenceiterator.UimaSentenceIterator;
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.Collection;

/**
 * Trains a Word2Vec model over a raw-text corpus (one sentence per line),
 * writes the learned word vectors to {@code word2vec.txt}, and demonstrates
 * two queries: nearest neighbours of "man" and the cosine similarity of
 * "cruise" and "voyage".
 */
public class Word2VecRawTextExample {

    private static final Logger log = LoggerFactory.getLogger(Word2VecRawTextExample.class);

    public static void main(String[] args) throws Exception {

        // Path to the raw-sentence corpus. NOTE(review): hard-coded Windows
        // path — adjust for your environment before running.
        String filePath = "c:/raw_sentences.txt";

        log.info("Load & Vectorize Sentences....");
        // Iterates the file sentence by sentence, stripping surrounding whitespace.
        SentenceIterator iter = UimaSentenceIterator.createWithPath(filePath);
        // Split each sentence on whitespace to obtain tokens.
        TokenizerFactory t = new DefaultTokenizerFactory();
        t.setTokenPreProcessor(new CommonPreprocessor());

        InMemoryLookupCache cache = new InMemoryLookupCache();
        WeightLookupTable table = new InMemoryLookupTable.Builder()
                .vectorLength(100)      // must match layerSize below
                .useAdaGrad(false)
                .cache(cache)
                .lr(0.025f).build();

        log.info("Building model....");
        // FIX: use parameterized types (ArrayList<String>, Collection<String>)
        // instead of raw types; behavior is unchanged.
        Word2Vec vec = new Word2Vec.Builder()
                .minWordFrequency(5).iterations(1)
                .layerSize(100).lookupTable(table)
                .stopWords(new ArrayList<String>())  // no stop words filtered
                .vocabCache(cache).seed(42)
                .windowSize(5).iterate(iter).tokenizerFactory(t).build();

        log.info("Fitting Word2Vec model....");
        vec.fit();

        log.info("Writing word vectors to text file....");
        WordVectorSerializer.writeWordVectors(vec, "word2vec.txt");

        log.info("Closest Words:");
        Collection<String> lst = vec.wordsNearest("man", 5);
        System.out.println(lst);
        double cosSim = vec.similarity("cruise", "voyage");
        System.out.println(cosSim);
    }
}
package chap9.java.science.data;

/*
 * GRAL: GRAphing Library for Java(R)
 *
 * (C) Copyright 2009-2013 Erich Seifert <dev@erichseifert.de>,
 * Michael Seifert <mseifert@error-reports.org>
 *
 * This file is part of GRAL.
 *
 * GRAL is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * GRAL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with GRAL. If not, see <http://www.gnu.org/licenses/>.
 */

import java.awt.Color;
import java.util.Random;

import de.erichseifert.gral.data.DataSeries;
import de.erichseifert.gral.data.DataSource;
import de.erichseifert.gral.data.DataTable;
import de.erichseifert.gral.examples.ExamplePanel;
import de.erichseifert.gral.plots.XYPlot;
import de.erichseifert.gral.plots.areas.AreaRenderer;
import de.erichseifert.gral.plots.areas.DefaultAreaRenderer2D;
import de.erichseifert.gral.plots.areas.LineAreaRenderer2D;
import de.erichseifert.gral.plots.lines.DefaultLineRenderer2D;
import de.erichseifert.gral.plots.lines.LineRenderer;
import de.erichseifert.gral.plots.points.DefaultPointRenderer2D;
import de.erichseifert.gral.plots.points.PointRenderer;
import de.erichseifert.gral.ui.InteractivePanel;
import de.erichseifert.gral.util.GraphicsUtils;
import de.erichseifert.gral.util.Insets2D;

/**
 * Example panel showing an area plot of three Gaussian random series:
 * the first two are drawn as filled areas, the third as a line area.
 */
public class AreaPlot extends ExamplePanel {
	/** Version id for serialization. */
	private static final long serialVersionUID = 3287044991898775949L;

	/** Source of random values for the example data. */
	private static final Random random = new Random();

	public AreaPlot() {
		// One x column (0..49) plus three independent Gaussian y columns.
		DataTable table = new DataTable(Double.class, Double.class, Double.class, Double.class);
		for (int i = 0; i < 50; i++) {
			table.add((double) i,
					random.nextGaussian(),
					random.nextGaussian(),
					random.nextGaussian());
		}

		// One series per y column, all sharing column 0 as x.
		DataSeries series1 = new DataSeries("series 1", table, 0, 1);
		DataSeries series2 = new DataSeries("series 2", table, 0, 2);
		DataSeries series3 = new DataSeries("series 3", table, 0, 3);

		// Combine the three series into a single xy-plot.
		XYPlot plot = new XYPlot(series1, series2, series3);
		plot.setLegendVisible(true);
		plot.setInsets(new Insets2D.Double(20.0, 40.0, 20.0, 20.0));

		// Style each series.
		formatFilledArea(plot, series1, COLOR2);
		formatFilledArea(plot, series2, COLOR1);
		formatLineArea(plot, series3, GraphicsUtils.deriveDarker(COLOR1));

		// Embed the plot in this Swing panel.
		add(new InteractivePanel(plot));
	}

	/** Styles a series as points, a gapped line, and a translucent filled area. */
	private static void formatFilledArea(XYPlot plot, DataSource series, Color color) {
		PointRenderer points = new DefaultPointRenderer2D();
		points.setColor(color);
		plot.setPointRenderer(series, points);

		LineRenderer line = new DefaultLineRenderer2D();
		line.setColor(color);
		line.setGap(3.0);
		line.setGapRounded(true);
		plot.setLineRenderer(series, line);

		AreaRenderer area = new DefaultAreaRenderer2D();
		area.setColor(GraphicsUtils.deriveWithAlpha(color, 64));
		plot.setAreaRenderer(series, area);
	}

	/** Styles a series as points over vertical value lines, with no connecting line. */
	private static void formatLineArea(XYPlot plot, DataSource series, Color color) {
		PointRenderer points = new DefaultPointRenderer2D();
		points.setColor(color);
		plot.setPointRenderer(series, points);
		plot.setLineRenderer(series, null);

		AreaRenderer area = new LineAreaRenderer2D();
		area.setGap(3.0);
		area.setColor(color);
		plot.setAreaRenderer(series, area);
	}

	@Override
	public String getTitle() {
		return "Area plot";
	}

	@Override
	public String getDescription() {
		return "Area plot of three series with different styling";
	}

	public static void main(String[] args) {
		new AreaPlot().showInFrame();
	}
}
package chap9.java.science.data;

/*
 * GRAL: GRAphing Library for Java(R)
 *
 * (C) Copyright 2009-2013 Erich Seifert <dev@erichseifert.de>,
 * Michael Seifert <mseifert@error-reports.org>
 *
 * This file is part of GRAL.
 *
 * GRAL is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * GRAL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with GRAL. If not, see <http://www.gnu.org/licenses/>.
 */

import java.util.Random;

import de.erichseifert.gral.data.DataSource;
import de.erichseifert.gral.data.DataTable;
import de.erichseifert.gral.data.EnumeratedData;
import de.erichseifert.gral.data.statistics.Histogram1D;
import de.erichseifert.gral.data.statistics.Statistics;
import de.erichseifert.gral.examples.ExamplePanel;
import de.erichseifert.gral.plots.BarPlot;
import de.erichseifert.gral.ui.InteractivePanel;
import de.erichseifert.gral.util.GraphicsUtils;
import de.erichseifert.gral.util.Insets2D;
import de.erichseifert.gral.util.MathUtils;
import de.erichseifert.gral.util.Orientation;


/**
 * Example panel that bins standard-normal random samples into a fixed set of
 * intervals and renders the resulting histogram as a bar plot.
 */
public class HistogramPlot extends ExamplePanel {
	/** Version id for serialization. */
	private static final long serialVersionUID = 4458280577519421950L;

	/** Number of random samples to draw. */
	private static final int SAMPLE_COUNT = 1000;

	public HistogramPlot() {
		// Draw SAMPLE_COUNT standard-normal samples.
		Random rand = new Random();
		DataTable samples = new DataTable(Double.class);
		for (int n = 0; n < SAMPLE_COUNT; n++) {
			samples.add(rand.nextGaussian());
		}

		// Bin the samples using fixed borders from -4 to +4 in steps of 0.8.
		Number[] binBorders = {-4.0, -3.2, -2.4, -1.6, -0.8, 0.0, 0.8, 1.6, 2.4, 3.2, 4.0};
		Histogram1D histogram = new Histogram1D(samples, Orientation.VERTICAL, binBorders);
		// Add an x dimension: start at the center of the first bin, step by the bin width.
		DataSource histogram2d = new EnumeratedData(histogram, (-4.0 + -3.2) / 2.0, 0.8);

		// Render the binned counts as bars.
		BarPlot plot = new BarPlot(histogram2d);

		// Overall plot appearance.
		plot.setInsets(new Insets2D.Double(20.0, 65.0, 50.0, 40.0));
		plot.getTitle().setText(
			String.format("Distribution of %d random samples", samples.getRowCount()));
		plot.setBarWidth(0.78);

		// x axis: ticks aligned to bin borders, no minor ticks.
		plot.getAxisRenderer(BarPlot.AXIS_X).setTickAlignment(0.0);
		plot.getAxisRenderer(BarPlot.AXIS_X).setTickSpacing(0.8);
		plot.getAxisRenderer(BarPlot.AXIS_X).setMinorTicksVisible(false);
		// y axis: from zero up to just above the tallest bin, rounded to 25s.
		plot.getAxis(BarPlot.AXIS_Y).setRange(0.0,
			MathUtils.ceil(histogram.getStatistics().get(Statistics.MAX) * 1.1, 25.0));
		plot.getAxisRenderer(BarPlot.AXIS_Y).setTickAlignment(0.0);
		plot.getAxisRenderer(BarPlot.AXIS_Y).setMinorTicksVisible(false);
		plot.getAxisRenderer(BarPlot.AXIS_Y).setIntersection(-4.4);

		// Bars: semi-transparent fill with visible count labels.
		plot.getPointRenderer(histogram2d).setColor(
			GraphicsUtils.deriveWithAlpha(COLOR1, 128));
		plot.getPointRenderer(histogram2d).setValueVisible(true);

		// Static display: disable panning and zooming.
		InteractivePanel panel = new InteractivePanel(plot);
		panel.setPannable(false);
		panel.setZoomable(false);
		add(panel);
	}

	@Override
	public String getTitle() {
		return "Histogram plot";
	}

	@Override
	public String getDescription() {
		return String.format("Histogram of %d samples", SAMPLE_COUNT);
	}

	public static void main(String[] args) {
		new HistogramPlot().showInFrame();
	}
}
package chap9.java.science.data;

/*
 * GRAL: GRAphing Library for Java(R)
 *
 * (C) Copyright 2009-2013 Erich Seifert <dev@erichseifert.de>,
 * Michael Seifert <mseifert@error-reports.org>
 *
 * This file is part of GRAL.
 *
 * GRAL is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * GRAL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with GRAL. If not, see <http://www.gnu.org/licenses/>.
 */

import java.awt.BorderLayout;
import java.util.Random;

import de.erichseifert.gral.data.DataTable;
import de.erichseifert.gral.examples.ExamplePanel;
import de.erichseifert.gral.plots.XYPlot;
import de.erichseifert.gral.ui.InteractivePanel;
import de.erichseifert.gral.util.Insets2D;


/**
 * Example panel showing a scatter plot of {@code SAMPLE_COUNT} points drawn
 * from a two-dimensional Gaussian distribution (standard deviation 2 per axis).
 */
public class ScatterPlot extends ExamplePanel {
	/** Version id for serialization. */
	private static final long serialVersionUID = -412699430625953887L;

	/** Number of data points to generate. */
	private static final int SAMPLE_COUNT = 100000;

	/** Instance to generate random data values. */
	private static final Random random = new Random();

	@SuppressWarnings("unchecked")
	public ScatterPlot() {
		// Generate exactly SAMPLE_COUNT data points.
		// BUG FIX: the original loop condition was i <= SAMPLE_COUNT, which
		// produced 100,001 points — one more than both the comment and the
		// plot title (via getDescription()) claim.
		DataTable data = new DataTable(Double.class, Double.class);
		for (int i = 0; i < SAMPLE_COUNT; i++) {
			data.add(random.nextGaussian() * 2.0, random.nextGaussian() * 2.0);
		}

		// Create a new xy-plot over the generated data.
		XYPlot plot = new XYPlot(data);

		// Format plot: margins and title.
		plot.setInsets(new Insets2D.Double(20.0, 40.0, 40.0, 40.0));
		plot.getTitle().setText(getDescription());

		// Format points.
		plot.getPointRenderer(data).setColor(COLOR1);

		// Add plot to Swing component.
		add(new InteractivePanel(plot), BorderLayout.CENTER);
	}

	@Override
	public String getTitle() {
		return "Scatter plot";
	}

	@Override
	public String getDescription() {
		return String.format("Scatter plot with %d data points", SAMPLE_COUNT);
	}

	public static void main(String[] args) {
		new ScatterPlot().showInFrame();
	}

}
package chap9.java.science.data;

/*
 * GRAL: GRAphing Library for Java(R)
 *
 * (C) Copyright 2009-2013 Erich Seifert <dev@erichseifert.de>,
 * Michael Seifert <mseifert@error-reports.org>
 *
 * This file is part of GRAL.
 *
 * GRAL is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * GRAL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with GRAL. If not, see <http://www.gnu.org/licenses/>.
 */

import java.awt.BasicStroke;
import java.awt.Color;
import java.awt.Font;
import java.awt.LinearGradientPaint;

import de.erichseifert.gral.data.DataTable;
import de.erichseifert.gral.examples.ExamplePanel;
import de.erichseifert.gral.plots.BarPlot;
import de.erichseifert.gral.plots.BarPlot.BarRenderer;
import de.erichseifert.gral.ui.InteractivePanel;
import de.erichseifert.gral.util.GraphicsUtils;
import de.erichseifert.gral.util.Insets2D;
import de.erichseifert.gral.util.Location;


/**
 * Example panel showing a bar plot of monthly example values with a vertical
 * color gradient and month names rendered inside the bars.
 */
public class SimpleBarPlot extends ExamplePanel {
	/** Version id for serialization. */
	private static final long serialVersionUID = -2793954497895054530L;

	@SuppressWarnings("unchecked")
	public SimpleBarPlot() {
		// Example data: one row per month — x position, bar value, label.
		double[] positions = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8};
		int[] values = {1, 3, -2, 6, -4, 8, 9, 11};
		String[] months = {"January", "February", "March", "April",
				"May", "June", "July", "August"};
		DataTable data = new DataTable(Double.class, Integer.class, String.class);
		for (int row = 0; row < months.length; row++) {
			data.add(positions[row], values[row], months[row]);
		}

		// Create a new bar plot over the example data.
		BarPlot plot = new BarPlot(data);

		// Format plot: margins and bar width.
		plot.setInsets(new Insets2D.Double(40.0, 40.0, 40.0, 40.0));
		plot.setBarWidth(0.075);

		// Format bars: vertical gradient fill and month labels (column 2)
		// centered inside each bar.
		BarRenderer bars = (BarRenderer) plot.getPointRenderer(data);
		bars.setColor(
			new LinearGradientPaint(0f, 0f, 0f, 1f,
				new float[] {0.0f, 1.0f},
				new Color[] {COLOR1, GraphicsUtils.deriveBrighter(COLOR1)}
			)
		);
		bars.setValueVisible(true);
		bars.setValueColumn(2);
		bars.setValueLocation(Location.CENTER);
		bars.setValueColor(GraphicsUtils.deriveDarker(COLOR1));
		bars.setValueFont(Font.decode(null).deriveFont(Font.BOLD));

		// Add plot to Swing component.
		add(new InteractivePanel(plot));
	}

	@Override
	public String getTitle() {
		return "Bar plot";
	}

	@Override
	public String getDescription() {
		return "Bar plot with example data and color gradients";
	}

	public static void main(String[] args) {
		new SimpleBarPlot().showInFrame();
	}
}
package chap9.java.science.data;

/*
 * GRAL: GRAphing Library for Java(R)
 *
 * (C) Copyright 2009-2013 Erich Seifert <dev@erichseifert.de>,
 * Michael Seifert <mseifert@error-reports.org>
 *
 * This file is part of GRAL.
 *
 * GRAL is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * GRAL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with GRAL. If not, see <http://www.gnu.org/licenses/>.
 */

import java.awt.BasicStroke;
import java.awt.Color;
import java.awt.Dimension;
import java.awt.Stroke;
import java.util.Random;

import de.erichseifert.gral.data.DataSource;
import de.erichseifert.gral.data.DataTable;
import de.erichseifert.gral.examples.ExamplePanel;
import de.erichseifert.gral.plots.BoxPlot;
import de.erichseifert.gral.plots.BoxPlot.BoxWhiskerRenderer;
import de.erichseifert.gral.plots.XYPlot.XYNavigationDirection;
import de.erichseifert.gral.plots.colors.LinearGradient;
import de.erichseifert.gral.plots.colors.ScaledContinuousColorMapper;
import de.erichseifert.gral.ui.InteractivePanel;
import de.erichseifert.gral.util.DataUtils;
import de.erichseifert.gral.util.GraphicsUtils;
import de.erichseifert.gral.util.Insets2D;


/**
 * Example panel showing three box-and-whisker plots, one per column of a
 * table of rounded Gaussian integers.
 */
public class SimpleBoxPlot extends ExamplePanel {
	/** Version id for serialization. */
	private static final long serialVersionUID = 5228891435595348789L;

	/** Number of rows of random samples to generate. */
	private static final int SAMPLE_COUNT = 50;

	/** Source of random values for the example data. */
	private static final Random random = new Random();

	@SuppressWarnings("unchecked")
	public SimpleBoxPlot() {
		setPreferredSize(new Dimension(400, 600));

		// Three columns of integers drawn from round(5 * N(0, 1)).
		DataTable samples = new DataTable(Integer.class, Integer.class, Integer.class);
		for (int n = 0; n < SAMPLE_COUNT; n++) {
			int a = (int) Math.round(5.0 * random.nextGaussian());
			int b = (int) Math.round(5.0 * random.nextGaussian());
			int c = (int) Math.round(5.0 * random.nextGaussian());
			samples.add(a, b, c);
		}

		// Derive box-and-whisker statistics per column and plot them.
		DataSource boxData = BoxPlot.createBoxData(samples);
		BoxPlot plot = new BoxPlot(boxData);

		// Format plot margins.
		plot.setInsets(new Insets2D.Double(20.0, 50.0, 40.0, 20.0));

		// Label the x axis with one custom tick per source column.
		plot.getAxisRenderer(BoxPlot.AXIS_X).setCustomTicks(
			DataUtils.map(
				new Double[] {1.0, 2.0, 3.0},
				new String[] {"Column 1", "Column 2", "Column 3"}
			)
		);

		// Render boxes, whiskers and center bars in a single color.
		BoxWhiskerRenderer boxes =
			(BoxWhiskerRenderer) plot.getPointRenderer(boxData);
		boxes.setBoxBorderColor(COLOR1);
		boxes.setWhiskerColor(COLOR1);
		boxes.setCenterBarColor(COLOR1);

		// Restrict navigation to the vertical (value) axis.
		plot.getNavigator().setDirection(XYNavigationDirection.VERTICAL);

		// Add plot to Swing component.
		InteractivePanel panel = new InteractivePanel(plot);
		add(panel);
	}

	@Override
	public String getTitle() {
		return "Box-and-whisker plot";
	}

	@Override
	public String getDescription() {
		return String.format("Three box-and-whisker plots created from %d random samples", SAMPLE_COUNT);
	}

	public static void main(String[] args) {
		new SimpleBoxPlot().showInFrame();
	}
}
package chap9.java.science.data;

/*
 * GRAL: GRAphing Library for Java(R)
 *
 * (C) Copyright 2009-2013 Erich Seifert <dev@erichseifert.de>,
 * Michael Seifert <mseifert@error-reports.org>
 *
 * This file is part of GRAL.
 *
 * GRAL is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * GRAL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with GRAL. If not, see <http://www.gnu.org/licenses/>.
 */

import java.awt.BorderLayout;
import java.awt.Color;
import java.awt.Font;
import java.util.Random;

import de.erichseifert.gral.data.DataTable;
import de.erichseifert.gral.examples.ExamplePanel;
import de.erichseifert.gral.plots.PiePlot;
import de.erichseifert.gral.plots.PiePlot.PieSliceRenderer;
import de.erichseifert.gral.plots.colors.LinearGradient;
import de.erichseifert.gral.ui.InteractivePanel;
import de.erichseifert.gral.util.Insets2D;


/**
 * Example panel showing a donut (pie) plot of random integer values; roughly
 * 15% of the slices are negated so the plot also demonstrates negative
 * segments.
 */
public class SimplePiePlot extends ExamplePanel {
	/** Version id for serialization. */
	private static final long serialVersionUID = -3039317265508932299L;

	/** Number of random slice values to generate. */
	private static final int SAMPLE_COUNT = 10;

	/** Instance to generate random data values. */
	private static Random random = new Random();

	@SuppressWarnings("unchecked")
	public SimplePiePlot() {
		// One random magnitude in [2, 9] per slice; ~15% become negative.
		DataTable data = new DataTable(Integer.class);
		for (int i = 0; i < SAMPLE_COUNT; i++) {
			int magnitude = random.nextInt(8) + 2;
			if (random.nextDouble() <= 0.15) {
				data.add(-magnitude);
			} else {
				data.add(magnitude);
			}
		}

		// Create a new pie plot over the generated values.
		PiePlot plot = new PiePlot(data);

		// Format plot: title, pie size, legend, margins.
		plot.getTitle().setText(getDescription());
		plot.setRadius(0.9);
		plot.setLegendVisible(true);
		plot.setInsets(new Insets2D.Double(20.0, 40.0, 40.0, 40.0));

		// Format slices: inner hole (donut), gaps, gradient colors, labels.
		PieSliceRenderer slices =
			(PieSliceRenderer) plot.getPointRenderer(data);
		slices.setInnerRadius(0.4);
		slices.setGap(0.2);
		LinearGradient colors = new LinearGradient(COLOR1, COLOR2);
		slices.setColor(colors);
		slices.setValueVisible(true);
		slices.setValueColor(Color.WHITE);
		slices.setValueFont(Font.decode(null).deriveFont(Font.BOLD));

		// Add plot to Swing component.
		add(new InteractivePanel(plot), BorderLayout.CENTER);
	}

	@Override
	public String getTitle() {
		return "Donut plot";
	}

	@Override
	public String getDescription() {
		return String.format("Donut plot of %d random data values", SAMPLE_COUNT);
	}

	public static void main(String[] args) {
		new SimplePiePlot().showInFrame();
	}
}
/*
 * GRAL: GRAphing Library for Java(R)
 *
 * (C) Copyright 2009-2013 Erich Seifert <dev@erichseifert.de>,
 * Michael Seifert <mseifert@error-reports.org>
 *
 * This file is part of GRAL.
 *
 * GRAL is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * GRAL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with GRAL. If not, see <http://www.gnu.org/licenses/>.
 */
package chap9.java.science.data;

import java.awt.Color;
import java.io.FileNotFoundException;
import java.io.IOException;

import javax.swing.JFrame;

import de.erichseifert.gral.data.DataTable;
import de.erichseifert.gral.plots.XYPlot;
import de.erichseifert.gral.plots.lines.DefaultLineRenderer2D;
import de.erichseifert.gral.plots.lines.LineRenderer;
import de.erichseifert.gral.ui.InteractivePanel;

/**
 * Swing frame that plots y = 5*sin(x) for x in [-5, 5] (step 0.25) as a black
 * line with black points, using GRAL's interactive XY plot panel.
 */
public class SineGraph extends JFrame {
	private static final long serialVersionUID = 1L;

	/**
	 * Builds the frame and its sine plot.
	 *
	 * @throws FileNotFoundException declared for API compatibility; not thrown here
	 * @throws IOException declared for API compatibility; not thrown here
	 */
	public SineGraph() throws FileNotFoundException, IOException {
		setDefaultCloseOperation(EXIT_ON_CLOSE);
		setSize(1600, 1400);

		// Sample the sine curve: x in [-5, 5] with step 0.25.
		DataTable data = new DataTable(Double.class, Double.class);
		for (double x = -5.0; x <= 5.0; x += 0.25) {
			double y = 5.0 * Math.sin(x);
			data.add(x, y);
		}

		XYPlot plot = new XYPlot(data);
		getContentPane().add(new InteractivePanel(plot));

		// Connect the points with a line and draw both in black.
		LineRenderer lines = new DefaultLineRenderer2D();
		plot.setLineRenderer(data, lines);
		Color color = new Color(0.0f, 0.0f, 0.0f);
		plot.getPointRenderer(data).setColor(color);
		plot.getLineRenderer(data).setColor(color);
	}

	public static void main(String[] args) {
		// BUG FIX: the original swallowed the IOException in an empty catch and
		// then unconditionally called frame.setVisible(true), so any failure in
		// the constructor turned into a NullPointerException. Keep setVisible
		// inside the try and report the failure instead.
		try {
			SineGraph frame = new SineGraph();
			frame.setVisible(true);
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
}
try { 61 | frame = new SineGraph(); 62 | } catch (IOException e) { 63 | } 64 | frame.setVisible(true); 65 | } 66 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Java Data Science Cookbook 5 | This is the code repository for [Java Data Science Cookbook](https://www.packtpub.com/big-data-and-business-intelligence/java-data-science-cookbook?utm_source=github&utm_medium=repository&utm_campaign=9781787122536), published by [Packt](https://www.packtpub.com/?utm_source=github). 
It contains all the supporting project files necessary to work through the book from start to finish. 6 | ## About the Book 7 | If you are looking to build data science models that are good for production, Java has come to the rescue. With the aid of strong libraries such as MLlib, Weka, DL4j, and more, you can efficiently perform all the data science tasks you need to. 8 | ## Instructions and Navigation 9 | All of the code is organized into folders. Each folder starts with a number followed by the application name. For example, Chapter02. 10 | 11 | 12 | 13 | The code will look like the following: 14 | ``` 15 | classVals = new ArrayList(); 16 | for (int i = 0; i < 5; i++){ 17 | classVals.add("class" + (i + 1)); 18 | } 19 | ``` 20 | 21 | We have used Java to solve real-world data science problems. Our focus was to deliver content that can be effective for anyone who wants to know how to solve problems with Java. A minimum knowledge of Java is required, such as classes, objects, methods, arguments and parameters, exceptions, and exporting Java Archive (JAR) files. The code is well supported with narrations, information, and tips to help the readers understand the 22 | context and purpose. The theories behind the problems solved in this book, on many occasions, are not thoroughly discussed, but references for interested readers are provided whenever necessary. 
23 | 24 | ## Related Products 25 | * [Practical Data Science Cookbook - Second Edition](https://www.packtpub.com/big-data-and-business-intelligence/practical-data-science-cookbook-second-edition?utm_source=github&utm_medium=repository&utm_campaign=9781787129627) 26 | 27 | * [Apache Spark for Data Science Cookbook](https://www.packtpub.com/big-data-and-business-intelligence/apache-spark-data-science-cookbook?utm_source=github&utm_medium=repository&utm_campaign=9781785880100) 28 | 29 | * [Mastering Java for Data Science](https://www.packtpub.com/big-data-and-business-intelligence/mastering-java-data-science?utm_source=github&utm_medium=repository&utm_campaign=9781782174271) 30 | 31 | ### Suggestions and Feedback 32 | [Click here](https://docs.google.com/forms/d/e/1FAIpQLSe5qwunkGf6PUvzPirPDtuy1Du5Rlzew23UBp2S-P3wB-GcwQ/viewform) if you have any feedback or suggestions. 33 | ### Download a free PDF 34 | 35 | If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost.
Simply click on the link to claim your free PDF.
36 |

https://packt.link/free-ebook/9781787122536

--------------------------------------------------------------------------------