├── Chapter01 ├── CleaningData.java ├── FileListing.java ├── JsonReading.java ├── JsonWriting.java ├── JsoupTesting.java ├── TestDB.java ├── TestJdom.java ├── TestRecursiveDirectoryTraversal.java ├── TestTika.java ├── TestTsv.java ├── TestUnivocity.java ├── TextFileReadApache.java ├── TextFileReadJava.java └── WebdataExtractionSelenium.java ├── Chapter02 ├── IndexFiles.java └── SearchFiles.java ├── Chapter03 └── chap3 │ └── java │ └── science │ └── data │ ├── AggregateStats.java │ ├── AnovaTest.java │ ├── ChiSquareTest.java │ ├── CovarianceTest.java │ ├── DescriptiveStats.java │ ├── FrequencyStats.java │ ├── GLSRegressionTest.java │ ├── KSTest.java │ ├── OLSRegressionTest.java │ ├── PearsonTest.java │ ├── RegressionTest.java │ ├── SummaryStats.java │ ├── TTest.java │ ├── WordFrequencyStatsApache.java │ └── WordFrequencyStatsJava.java ├── Chapter04 └── Code │ ├── B05916_04_01.png │ ├── B05916_04_02.png │ ├── WekaArffTest.java │ ├── WekaAssociationRuleTest.java │ ├── WekaCVTest.java │ ├── WekaClassesToClusterTest.java │ ├── WekaClusterTest.java │ ├── WekaFeatureSelectionTest.java │ ├── WekaFilteredClassifierTest.java │ ├── WekaLinearRegressionTest.java │ ├── WekaLogisticRegressionTest.java │ └── WekaTrainTest.java ├── Chapter05 └── chapter-5 │ ├── JavaMachineLearning.java │ ├── MOA.java │ ├── Mulan.java │ └── StanfordClassifier.java ├── Chapter06 ├── CosineSimilarity.java ├── Lemmatizer.java ├── OpenNlpSenToken.java ├── SentenceDetection.java ├── WekaClassification.java └── WordDetection.java ├── Chapter07 └── Code │ ├── KMeansClusteringMlib.java │ ├── LinearRegressionMlib.java │ ├── OnlineLogisticRegressionTest.java │ ├── OnlineLogisticRegressionTrain.java │ ├── RandomForestMlib.java │ └── ScalaTest.java ├── Chapter08 ├── Chap-08-Code.rar └── Chap-08-Code │ └── Code │ ├── DBNIrisExample.java │ ├── DeepAutoEncoderExample.java │ └── Word2VecRawTextExample.java ├── Chapter09 └── data │ ├── AreaPlot.java │ ├── HistogramPlot.java │ ├── ScatterPlot.java │ ├── 
SimpleBarPlot.java │ ├── SimpleBoxPlot.java │ ├── SimplePiePlot.java │ └── SineGraph.java ├── LICENSE └── README.md /Chapter01/CleaningData.java: -------------------------------------------------------------------------------- 1 | package chap1.java.science.data; 2 | 3 | public class CleaningData { 4 | public static void main(String[] args) throws Exception { 5 | CleaningData clean = new CleaningData(); 6 | String text = "Your text here you have got from some file"; 7 | String cleanedText = clean.cleanText(text); 8 | //Process cleanedText 9 | } 10 | 11 | public String cleanText(String text){ 12 | text = text.replaceAll("[^\\p{ASCII}]",""); 13 | text = text.replaceAll("\\s+", " "); 14 | text = text.replaceAll("\\p{Cntrl}", ""); 15 | text = text.replaceAll("[^\\p{Print}]", ""); 16 | text = text.replaceAll("\\p{C}", ""); 17 | return text; 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /Chapter01/FileListing.java: -------------------------------------------------------------------------------- 1 | package chap1.java.science.data; 2 | 3 | import java.io.File; 4 | import java.util.List; 5 | import org.apache.commons.io.FileUtils; 6 | import org.apache.commons.io.filefilter.TrueFileFilter; 7 | 8 | public class FileListing{ 9 | public static void main (String[] args){ 10 | FileListing fileListing = new FileListing(); 11 | fileListing.listFiles("Path for the root directory here"); 12 | } 13 | public void listFiles(String rootDir){ 14 | File dir = new File(rootDir); 15 | 16 | List files = (List) FileUtils.listFiles(dir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE); 17 | for (File file : files) { 18 | System.out.println("file: " + file.getAbsolutePath()); 19 | } 20 | } 21 | } -------------------------------------------------------------------------------- /Chapter01/JsonReading.java: -------------------------------------------------------------------------------- 1 | package chap1.java.science.data; 2 | 3 | 4 | import 
java.io.FileNotFoundException; 5 | import java.io.FileReader; 6 | import java.io.IOException; 7 | import java.util.Iterator; 8 | import org.json.simple.JSONArray; 9 | import org.json.simple.JSONObject; 10 | import org.json.simple.parser.JSONParser; 11 | import org.json.simple.parser.ParseException; 12 | 13 | public class JsonReading { 14 | public static void main(String[] args){ 15 | JsonReading jsonReading = new JsonReading(); 16 | jsonReading.readJson("C:/testJSON.json"); 17 | } 18 | public void readJson(String inFileName) { 19 | JSONParser parser = new JSONParser(); 20 | try { 21 | Object obj = parser.parse(new FileReader(inFileName)); 22 | JSONObject jsonObject = (JSONObject) obj; 23 | 24 | String name = (String) jsonObject.get("book"); 25 | System.out.println(name); 26 | 27 | String author = (String) jsonObject.get("author"); 28 | System.out.println(author); 29 | 30 | JSONArray reviews = (JSONArray) jsonObject.get("messages"); 31 | Iterator iterator = reviews.iterator(); 32 | while (iterator.hasNext()) { 33 | System.out.println(iterator.next()); 34 | } 35 | } catch (FileNotFoundException e) { 36 | //Your exception handling here 37 | } catch (IOException e) { 38 | //Your exception handling here 39 | } catch (ParseException e) { 40 | //Your exception handling here 41 | } 42 | } 43 | } 44 | 45 | -------------------------------------------------------------------------------- /Chapter01/JsonWriting.java: -------------------------------------------------------------------------------- 1 | package chap1.java.science.data; 2 | 3 | import java.io.FileWriter; 4 | import java.io.IOException; 5 | import org.json.simple.JSONArray; 6 | import org.json.simple.JSONObject; 7 | 8 | public class JsonWriting { 9 | 10 | public static void main(String[] args) { 11 | JsonWriting jsonWriting = new JsonWriting(); 12 | jsonWriting.writeJson("C:/testJSON.json"); 13 | } 14 | 15 | public void writeJson(String outFileName){ 16 | JSONObject obj = new JSONObject(); 17 | obj.put("book", 
"Harry Potter and the Philosopher's Stone"); 18 | obj.put("author", "J. K. Rowling"); 19 | 20 | JSONArray list = new JSONArray(); 21 | list.add("There are characters in this book that will remind us of all the people we have met. Everybody knows or knew a spoilt, overweight boy like Dudley or a bossy and interfering (yet kind-hearted) girl like Hermione"); 22 | list.add("Hogwarts is a truly magical place, not only in the most obvious way but also in all the detail that the author has gone to describe it so vibrantly."); 23 | list.add("Parents need to know that this thrill-a-minute story, the first in the Harry Potter series, respects kids' intelligence and motivates them to tackle its greater length and complexity, play imaginative games, and try to solve its logic puzzles. "); 24 | 25 | obj.put("messages", list); 26 | 27 | try { 28 | 29 | FileWriter file = new FileWriter(outFileName); 30 | file.write(obj.toJSONString()); 31 | file.flush(); 32 | file.close(); 33 | 34 | } catch (IOException e) { 35 | e.printStackTrace(); 36 | } 37 | 38 | System.out.print(obj); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /Chapter01/JsoupTesting.java: -------------------------------------------------------------------------------- 1 | package chap1.java.science.data; 2 | 3 | import java.io.IOException; 4 | 5 | import org.jsoup.Jsoup; 6 | import org.jsoup.nodes.Document; 7 | import org.jsoup.nodes.Element; 8 | import org.jsoup.select.Elements; 9 | 10 | public class JsoupTesting { 11 | public static void main(String[] args){ 12 | JsoupTesting test = new JsoupTesting(); 13 | test.extractDataWithJsoup("http://www.sustainalytics.com"); 14 | } 15 | 16 | public void extractDataWithJsoup(String href){ 17 | Document doc = null; 18 | try { 19 | doc = Jsoup.connect(href).timeout(10*1000).userAgent("Mozilla").ignoreHttpErrors(true).get(); 20 | } catch (IOException e) { 21 | //Your exception handling here 22 | } 23 | if(doc != null){ 24 | 
String title = doc.title(); 25 | String text = doc.body().text(); 26 | Elements links = doc.select("a[href]"); 27 | for (Element link : links) { 28 | String linkHref = link.attr("href"); 29 | String linkText = link.text(); 30 | String linkOuterHtml = link.outerHtml(); 31 | String linkInnerHtml = link.html(); 32 | } 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /Chapter01/TestDB.java: -------------------------------------------------------------------------------- 1 | package chap1.java.science.data; 2 | 3 | import java.sql.*; 4 | import com.mysql.jdbc.jdbc2.optional.MysqlDataSource; 5 | public class TestDB{ 6 | public static void main(String[] args){ 7 | TestDB test = new TestDB(); 8 | test.readTable("your user name", "your password", "your MySQL server name"); 9 | } 10 | public void readTable(String user, String password, String server){ 11 | MysqlDataSource dataSource = new MysqlDataSource(); 12 | dataSource.setUser(user); 13 | dataSource.setPassword(password); 14 | dataSource.setServerName(server); 15 | try{ 16 | Connection conn = dataSource.getConnection(); 17 | Statement stmt = conn.createStatement(); 18 | ResultSet rs = stmt.executeQuery("SELECT * FROM data_science.books"); 19 | while (rs.next()){ 20 | int id = rs.getInt("id"); 21 | String book = rs.getString("book_name"); 22 | String author = rs.getString("author_name"); 23 | Date dateCreated = rs.getDate("date_created"); 24 | System.out.format("%s, %s, %s, %s\n", id, book, author, dateCreated); 25 | } 26 | rs.close(); 27 | stmt.close(); 28 | conn.close(); 29 | }catch (Exception e){ 30 | //Your exception handling mechanism goes here. 
31 | } 32 | } 33 | } 34 | 35 | -------------------------------------------------------------------------------- /Chapter01/TestJdom.java: -------------------------------------------------------------------------------- 1 | package chap1.java.science.data; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.util.List; 6 | 7 | import org.jdom2.Document; 8 | import org.jdom2.Element; 9 | import org.jdom2.JDOMException; 10 | import org.jdom2.input.SAXBuilder; 11 | 12 | public class TestJdom { 13 | 14 | public static void main(String[] args){ 15 | TestJdom test = new TestJdom(); 16 | test.parseXml("C:/dummyxml.com"); 17 | 18 | } 19 | public void parseXml(String fileName){ 20 | SAXBuilder builder = new SAXBuilder(); 21 | File file = new File(fileName); 22 | try { 23 | Document document = (Document) builder.build(file); 24 | Element rootNode = document.getRootElement(); 25 | List list = rootNode.getChildren("author"); 26 | for (int i = 0; i < list.size(); i++) { 27 | Element node = (Element) list.get(i); 28 | System.out.println("First Name : " + node.getChildText("firstname")); 29 | System.out.println("Last Name : " + node.getChildText("lastname")); 30 | } 31 | } catch (IOException io) { 32 | System.out.println(io.getMessage()); 33 | } catch (JDOMException jdomex) { 34 | System.out.println(jdomex.getMessage()); 35 | } 36 | 37 | 38 | 39 | } 40 | 41 | } 42 | -------------------------------------------------------------------------------- /Chapter01/TestRecursiveDirectoryTraversal.java: -------------------------------------------------------------------------------- 1 | package chap1.java.science.data; 2 | 3 | import java.io.File; 4 | import java.util.HashSet; 5 | import java.util.Set; 6 | 7 | public class TestRecursiveDirectoryTraversal { 8 | public static void main(String[] args){ 9 | System.out.println(listFiles(new File("")).size()); 10 | } 11 | 12 | public static Set listFiles(File rootDir) { 13 | Set fileSet = new HashSet(); 14 | if(rootDir == 
null || rootDir.listFiles()==null){ 15 | return fileSet; 16 | } 17 | for (File fileOrDir : rootDir.listFiles()) { 18 | if (fileOrDir.isFile()){ 19 | fileSet.add(fileOrDir); 20 | } 21 | else{ 22 | fileSet.addAll(listFiles(fileOrDir)); 23 | } 24 | } 25 | 26 | return fileSet; 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /Chapter01/TestTika.java: -------------------------------------------------------------------------------- 1 | package chap1.java.science.data; 2 | 3 | import java.io.FileInputStream; 4 | import java.io.IOException; 5 | import java.io.InputStream; 6 | import org.apache.tika.metadata.Metadata; 7 | import org.apache.tika.parser.AutoDetectParser; 8 | import org.apache.tika.parser.ParseContext; 9 | import org.apache.tika.sax.BodyContentHandler; 10 | 11 | public class TestTika { 12 | public static void main(String args[]) throws Exception { 13 | TestTika tika = new TestTika(); 14 | tika.convertPdf("C:/testPDF.pdf"); 15 | } 16 | public void convertPdf(String fileName){ 17 | InputStream stream = null; 18 | try { 19 | stream = new FileInputStream(fileName); 20 | AutoDetectParser parser = new AutoDetectParser(); 21 | BodyContentHandler handler = new BodyContentHandler(-1); 22 | Metadata metadata = new Metadata(); 23 | parser.parse(stream, handler, metadata, new ParseContext()); 24 | System.out.println(handler.toString()); 25 | }catch (Exception e) { 26 | e.printStackTrace(); 27 | }finally { 28 | if (stream != null) 29 | try { 30 | stream.close(); 31 | } catch (IOException e) { 32 | System.out.println("Error closing stream"); 33 | } 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /Chapter01/TestTsv.java: -------------------------------------------------------------------------------- 1 | package chap1.java.science.data; 2 | 3 | import java.io.File; 4 | import java.util.Arrays; 5 | import java.util.List; 6 | 7 | import 
com.univocity.parsers.tsv.TsvParser; 8 | import com.univocity.parsers.tsv.TsvParserSettings; 9 | 10 | public class TestTsv { 11 | public void parseTsv(String fileName){ 12 | TsvParserSettings settings = new TsvParserSettings(); 13 | settings.getFormat().setLineSeparator("\n"); 14 | TsvParser parser = new TsvParser(settings); 15 | List allRows = parser.parseAll(new File(fileName)); 16 | for (int i = 0; i < allRows.size(); i++){ 17 | System.out.println(Arrays.asList(allRows.get(i))); 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /Chapter01/TestUnivocity.java: -------------------------------------------------------------------------------- 1 | package chap1.java.science.data; 2 | 3 | 4 | import java.io.File; 5 | import java.util.Arrays; 6 | import java.util.List; 7 | 8 | import com.univocity.parsers.common.processor.RowListProcessor; 9 | import com.univocity.parsers.csv.CsvParser; 10 | import com.univocity.parsers.csv.CsvParserSettings; 11 | 12 | public class TestUnivocity { 13 | public void parseCSV(String fileName){ 14 | CsvParserSettings parserSettings = new CsvParserSettings(); 15 | parserSettings.setLineSeparatorDetectionEnabled(true); 16 | RowListProcessor rowProcessor = new RowListProcessor(); 17 | parserSettings.setRowProcessor(rowProcessor); 18 | parserSettings.setHeaderExtractionEnabled(true); 19 | CsvParser parser = new CsvParser(parserSettings); 20 | parser.parse(new File(fileName)); 21 | 22 | String[] headers = rowProcessor.getHeaders(); 23 | List rows = rowProcessor.getRows(); 24 | for (int i = 0; i < rows.size(); i++){ 25 | System.out.println(Arrays.asList(rows.get(i))); 26 | } 27 | } 28 | 29 | public static void main(String[] args){ 30 | TestUnivocity test = new TestUnivocity(); 31 | test.parseCSV("C:/testCSV.csv"); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /Chapter01/TextFileReadApache.java: 
-------------------------------------------------------------------------------- 1 | package chap1.java.science.data; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | 6 | import org.apache.commons.io.FileUtils; 7 | 8 | public class TextFileReadApache { 9 | public static void main(String[] args){ 10 | TextFileReadApache test = new TextFileReadApache(); 11 | test.readFile("C:/dummy.txt"); 12 | 13 | } 14 | public void readFile(String fileName){ 15 | File file = new File(fileName); 16 | String text = ""; 17 | try { 18 | text = FileUtils.readFileToString(file, "UTF-8"); 19 | } catch (IOException e) { 20 | System.out.println("Error reading " + file.getAbsolutePath()); 21 | } 22 | //process text 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /Chapter01/TextFileReadJava.java: -------------------------------------------------------------------------------- 1 | package chap1.java.science.data; 2 | 3 | import java.io.IOException; 4 | import java.nio.file.Files; 5 | import java.nio.file.Paths; 6 | import java.util.stream.Stream; 7 | 8 | public class TextFileReadJava { 9 | public static void main(String[] args){ 10 | TextFileReadJava test = new TextFileReadJava(); 11 | test.readTextFile("C:/dummy.txt"); 12 | } 13 | public void readTextFile(String file){ 14 | try (Stream stream = Files.lines(Paths.get(file))) { 15 | stream.forEach(System.out::println); 16 | } catch (IOException e) { 17 | //Your exception handling here 18 | } 19 | 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /Chapter01/WebdataExtractionSelenium.java: -------------------------------------------------------------------------------- 1 | package chap1.java.science.data; 2 | 3 | import org.openqa.selenium.By; 4 | import org.openqa.selenium.WebDriver; 5 | import org.openqa.selenium.WebElement; 6 | import org.openqa.selenium.firefox.FirefoxDriver; 7 | 8 | public class WebdataExtractionSelenium { 9 
| public static void main(String[] args) { 10 | WebDriver driver = new FirefoxDriver(); 11 | driver.get("http://cogenglab.csd.uwo.ca/rushdi.htm"); 12 | 13 | WebElement webElement = driver.findElement(By.xpath("//*[@id='content']")); 14 | System.out.println(webElement.getText()); 15 | 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /Chapter02/IndexFiles.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.demo; 2 | 3 | import org.apache.lucene.analysis.Analyzer; 4 | import org.apache.lucene.analysis.standard.StandardAnalyzer; 5 | import org.apache.lucene.document.Document; 6 | import org.apache.lucene.document.Field; 7 | import org.apache.lucene.document.LongPoint; 8 | import org.apache.lucene.document.StringField; 9 | import org.apache.lucene.document.TextField; 10 | import org.apache.lucene.index.IndexWriter; 11 | import org.apache.lucene.index.IndexWriterConfig.OpenMode; 12 | import org.apache.lucene.index.IndexWriterConfig; 13 | import org.apache.lucene.index.Term; 14 | import org.apache.lucene.store.Directory; 15 | import org.apache.lucene.store.FSDirectory; 16 | 17 | import java.io.BufferedReader; 18 | import java.io.IOException; 19 | import java.io.InputStream; 20 | import java.io.InputStreamReader; 21 | import java.nio.charset.StandardCharsets; 22 | import java.nio.file.FileVisitResult; 23 | import java.nio.file.Files; 24 | import java.nio.file.Path; 25 | import java.nio.file.Paths; 26 | import java.nio.file.SimpleFileVisitor; 27 | import java.nio.file.attribute.BasicFileAttributes; 28 | import java.util.Date; 29 | 30 | public class IndexFiles { 31 | static void indexDocs(final IndexWriter writer, Path path) throws IOException { 32 | if (Files.isDirectory(path)) { 33 | Files.walkFileTree(path, new SimpleFileVisitor() { 34 | @Override 35 | public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { 36 | try { 37 | 
indexDoc(writer, file, attrs.lastModifiedTime().toMillis()); 38 | } catch (IOException ignore) { 39 | } 40 | return FileVisitResult.CONTINUE; 41 | } 42 | } 43 | ); 44 | } else { 45 | indexDoc(writer, path, Files.getLastModifiedTime(path).toMillis()); 46 | } 47 | } 48 | 49 | static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException { 50 | try (InputStream stream = Files.newInputStream(file)) { 51 | Document doc = new Document(); 52 | Field pathField = new StringField("path", file.toString(), Field.Store.YES); 53 | doc.add(pathField); 54 | doc.add(new LongPoint("modified", lastModified)); 55 | doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)))); 56 | 57 | if (writer.getConfig().getOpenMode() == OpenMode.CREATE) { 58 | System.out.println("adding " + file); 59 | writer.addDocument(doc); 60 | } else { 61 | System.out.println("updating " + file); 62 | writer.updateDocument(new Term("path", file.toString()), doc); 63 | } 64 | } 65 | } 66 | public static void main(String[] args) { 67 | String indexPath = "index"; 68 | String docsPath = null; 69 | boolean create = true; 70 | for(int i=0;i classes = new ArrayList(); 19 | classes.add(calorie); 20 | classes.add(fat); 21 | classes.add(carb); 22 | classes.add(control); 23 | 24 | System.out.println(TestUtils.oneWayAnovaFValue(classes)); // F-value 25 | System.out.println(TestUtils.oneWayAnovaPValue(classes)); // P-value 26 | System.out.println(TestUtils.oneWayAnovaTest(classes, 0.05)); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /Chapter03/chap3/java/science/data/ChiSquareTest.java: -------------------------------------------------------------------------------- 1 | package chap3.java.science.data; 2 | 3 | import org.apache.commons.math3.stat.inference.TestUtils; 4 | 5 | public class ChiSquareTest { 6 | public static void main(String[] args){ 7 | long[] observed = {43, 21, 25, 42, 57, 
59}; 8 | double[] expected = {99, 65, 79, 75, 87, 81}; 9 | ChiSquareTest test = new ChiSquareTest(); 10 | test.getChiSquare(observed, expected); 11 | } 12 | public void getChiSquare(long[] observed, double[] expected){ 13 | System.out.println(TestUtils.chiSquare(expected, observed));//t statistics 14 | System.out.println(TestUtils.chiSquareTest(expected, observed));//p value 15 | System.out.println(TestUtils.chiSquareTest(expected, observed, 0.05)); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /Chapter03/chap3/java/science/data/CovarianceTest.java: -------------------------------------------------------------------------------- 1 | package chap3.java.science.data; 2 | 3 | import org.apache.commons.math3.stat.correlation.Covariance; 4 | 5 | public class CovarianceTest { 6 | public static void main(String[] args){ 7 | double[] x = {43, 21, 25, 42, 57, 59}; 8 | double[] y = {99, 65, 79, 75, 87, 81}; 9 | CovarianceTest test = new CovarianceTest(); 10 | test.calculateCov(x, y); 11 | } 12 | public void calculateCov(double[] x, double[] y){ 13 | double covariance = new Covariance().covariance(x, y, false);//take out false too 14 | System.out.println(covariance); 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /Chapter03/chap3/java/science/data/DescriptiveStats.java: -------------------------------------------------------------------------------- 1 | package chap3.java.science.data; 2 | 3 | import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics; 4 | 5 | public class DescriptiveStats { 6 | public static void main(String[] args){ 7 | double[] values = {32, 39, 14, 98, 45, 44, 45, 34, 89, 67, 0, 15, 0, 56, 88}; 8 | DescriptiveStats descStatTest = new DescriptiveStats(); 9 | descStatTest.getDescStats(values); 10 | 11 | } 12 | public void getDescStats(double[] values){ 13 | DescriptiveStatistics stats = new DescriptiveStatistics(); 14 | for( int i = 0; i < 
values.length; i++) { 15 | stats.addValue(values[i]); 16 | } 17 | double mean = stats.getMean(); 18 | double std = stats.getStandardDeviation(); 19 | double median = stats.getPercentile(50); 20 | System.out.println(mean + "\t" + std + "\t" + median); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /Chapter03/chap3/java/science/data/FrequencyStats.java: -------------------------------------------------------------------------------- 1 | package chap3.java.science.data; 2 | 3 | import org.apache.commons.math3.stat.Frequency; 4 | 5 | public class FrequencyStats { 6 | public static void main(String[] args){ 7 | double[] values = {32, 39, 14, 98, 45, 44, 45, 34, 89, 67, 0, 15, 0, 56, 88}; 8 | FrequencyStats freqTest = new FrequencyStats(); 9 | freqTest.getFreqStats(values); 10 | 11 | } 12 | public void getFreqStats(double[] values){ 13 | Frequency freq = new Frequency(); 14 | for( int i = 0; i < values.length; i++) { 15 | freq.addValue(values[i]); 16 | } 17 | 18 | for( int i = 0; i < values.length; i++) { 19 | System.out.println(freq.getCount(values[i])); 20 | } 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /Chapter03/chap3/java/science/data/GLSRegressionTest.java: -------------------------------------------------------------------------------- 1 | package chap3.java.science.data; 2 | 3 | import org.apache.commons.math3.stat.regression.GLSMultipleLinearRegression; 4 | 5 | public class GLSRegressionTest { 6 | public static void main(String[] args){ 7 | double[] y = new double[]{11.0, 12.0, 13.0, 14.0, 15.0, 16.0}; 8 | double[][] x = new double[6][]; 9 | x[0] = new double[]{0, 0, 0, 0, 0}; 10 | x[1] = new double[]{2.0, 0, 0, 0, 0}; 11 | x[2] = new double[]{0, 3.0, 0, 0, 0}; 12 | x[3] = new double[]{0, 0, 4.0, 0, 0}; 13 | x[4] = new double[]{0, 0, 0, 5.0, 0}; 14 | x[5] = new double[]{0, 0, 0, 0, 6.0}; 15 | double[][] omega = new double[6][]; 16 | omega[0] = new 
double[]{1.1, 0, 0, 0, 0, 0}; 17 | omega[1] = new double[]{0, 2.2, 0, 0, 0, 0}; 18 | omega[2] = new double[]{0, 0, 3.3, 0, 0, 0}; 19 | omega[3] = new double[]{0, 0, 0, 4.4, 0, 0}; 20 | omega[4] = new double[]{0, 0, 0, 0, 5.5, 0}; 21 | omega[5] = new double[]{0, 0, 0, 0, 0, 6.6}; 22 | GLSRegressionTest test = new GLSRegressionTest(); 23 | test.calculateOlsRegression(x, y, omega); 24 | } 25 | public void calculateOlsRegression(double[][] x, double[] y, double[][] omega){ 26 | GLSMultipleLinearRegression regression = new GLSMultipleLinearRegression(); 27 | regression.newSampleData(y, x, omega); 28 | 29 | double[] beta = regression.estimateRegressionParameters(); 30 | double[] residuals = regression.estimateResiduals(); 31 | double[][] parametersVariance = regression.estimateRegressionParametersVariance(); 32 | double regressandVariance = regression.estimateRegressandVariance(); 33 | double sigma = regression.estimateRegressionStandardError(); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /Chapter03/chap3/java/science/data/KSTest.java: -------------------------------------------------------------------------------- 1 | package chap3.java.science.data; 2 | 3 | import org.apache.commons.math3.stat.inference.TestUtils; 4 | 5 | public class KSTest { 6 | public static void main(String[] args){ 7 | double[] x = {43, 21, 25, 42, 57, 59}; 8 | double[] y = {99, 65, 79, 75, 87, 81}; 9 | KSTest test = new KSTest(); 10 | test.calculateKs(x, y); 11 | } 12 | public void calculateKs(double[] x, double[] y){ 13 | double d = TestUtils.kolmogorovSmirnovStatistic(x, y); 14 | System.out.println(TestUtils.kolmogorovSmirnovTest(x, y, false)); 15 | System.out.println(TestUtils.exactP(d, x.length, y.length, false)); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /Chapter03/chap3/java/science/data/OLSRegressionTest.java: 
-------------------------------------------------------------------------------- 1 | package chap3.java.science.data; 2 | 3 | import org.apache.commons.math3.stat.regression.OLSMultipleLinearRegression; 4 | 5 | public class OLSRegressionTest { 6 | public static void main(String[] args){ 7 | double[] y = new double[]{11.0, 12.0, 13.0, 14.0, 15.0, 16.0}; 8 | double[][] x = new double[6][]; 9 | x[0] = new double[]{0, 0, 0, 0, 0}; 10 | x[1] = new double[]{2.0, 0, 0, 0, 0}; 11 | x[2] = new double[]{0, 3.0, 0, 0, 0}; 12 | x[3] = new double[]{0, 0, 4.0, 0, 0}; 13 | x[4] = new double[]{0, 0, 0, 5.0, 0}; 14 | x[5] = new double[]{0, 0, 0, 0, 6.0}; 15 | OLSRegressionTest test = new OLSRegressionTest(); 16 | test.calculateOlsRegression(x, y); 17 | } 18 | public void calculateOlsRegression(double[][] x, double[] y){ 19 | OLSMultipleLinearRegression regression = new OLSMultipleLinearRegression(); 20 | regression.newSampleData(y, x); 21 | 22 | double[] beta = regression.estimateRegressionParameters(); 23 | double[] residuals = regression.estimateResiduals(); 24 | double[][] parametersVariance = regression.estimateRegressionParametersVariance(); 25 | double regressandVariance = regression.estimateRegressandVariance(); 26 | double rSquared = regression.calculateRSquared(); 27 | double sigma = regression.estimateRegressionStandardError(); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /Chapter03/chap3/java/science/data/PearsonTest.java: -------------------------------------------------------------------------------- 1 | package chap3.java.science.data; 2 | 3 | import org.apache.commons.math3.stat.correlation.PearsonsCorrelation; 4 | 5 | public class PearsonTest { 6 | public static void main(String[] args){ 7 | double[] x = {43, 21, 25, 42, 57, 59}; 8 | double[] y = {99, 65, 79, 75, 87, 81}; 9 | PearsonTest test = new PearsonTest(); 10 | test.calculatePearson(x, y); 11 | } 12 | public void calculatePearson(double[] x, double[] y){ 
13 | PearsonsCorrelation pCorrelation = new PearsonsCorrelation(); 14 | double cor = pCorrelation.correlation(x, y);//take out false too 15 | System.out.println(cor); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /Chapter03/chap3/java/science/data/RegressionTest.java: -------------------------------------------------------------------------------- 1 | package chap3.java.science.data; 2 | 3 | import org.apache.commons.math3.stat.regression.SimpleRegression; 4 | 5 | public class RegressionTest { 6 | 7 | public static void main(String[] args){ 8 | double[][] data = { { 1, 3 }, {2, 5 }, {3, 7 }, {4, 14 }, {5, 11 }}; 9 | RegressionTest test = new RegressionTest(); 10 | test.calculateRegression(data); 11 | } 12 | public void calculateRegression(double[][] data){ 13 | SimpleRegression regression = new SimpleRegression(); 14 | regression.addData(data); 15 | System.out.println(regression.getIntercept()); 16 | System.out.println(regression.getSlope()); 17 | System.out.println(regression.getSlopeStdErr()); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /Chapter03/chap3/java/science/data/SummaryStats.java: -------------------------------------------------------------------------------- 1 | package chap3.java.science.data; 2 | 3 | import org.apache.commons.math3.stat.descriptive.SummaryStatistics; 4 | 5 | public class SummaryStats { 6 | public static void main(String[] args){ 7 | double[] values = {32, 39, 14, 98, 45, 44, 45, 34, 89, 67, 0, 15, 0, 56, 88}; 8 | SummaryStats summaryStatTest = new SummaryStats(); 9 | summaryStatTest.getSummaryStats(values); 10 | } 11 | public void getSummaryStats(double[] values){ 12 | SummaryStatistics stats = new SummaryStatistics(); 13 | for( int i = 0; i < values.length; i++) { 14 | stats.addValue(values[i]); 15 | } 16 | double mean = stats.getMean(); 17 | double std = stats.getStandardDeviation(); 18 | System.out.println(mean + "\t" + 
std); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /Chapter03/chap3/java/science/data/TTest.java: -------------------------------------------------------------------------------- 1 | package chap3.java.science.data; 2 | 3 | import org.apache.commons.math3.stat.inference.TestUtils; 4 | 5 | public class TTest { 6 | public static void main(String[] args){ 7 | double[] sample1 = {43, 21, 25, 42, 57, 59}; 8 | double[] sample2 = {99, 65, 79, 75, 87, 81}; 9 | TTest test = new TTest(); 10 | test.getTtest(sample1, sample2); 11 | } 12 | public void getTtest(double[] sample1, double[] sample2){ 13 | System.out.println(TestUtils.pairedT(sample1, sample2));//t statistics 14 | System.out.println(TestUtils.pairedTTest(sample1, sample2));//p value 15 | System.out.println(TestUtils.pairedTTest(sample1, sample2, 0.05)); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /Chapter03/chap3/java/science/data/WordFrequencyStatsApache.java: -------------------------------------------------------------------------------- 1 | package chap3.java.science.data; 2 | 3 | import org.apache.commons.math3.stat.Frequency; 4 | 5 | public class WordFrequencyStatsApache { 6 | public static void main(String[] args){ 7 | String str = "Horatio says 'tis but our fantasy, " 8 | + "And will not let belief take hold of him " 9 | + "Touching this dreaded sight, twice seen of us. 
" 10 | + "Therefore I have entreated him along, 35" 11 | + "With us to watch the minutes of this night, " 12 | + "That, if again this apparition come, " 13 | + "He may approve our eyes and speak to it."; 14 | String[] words = str.toLowerCase().split("\\W+"); 15 | WordFrequencyStatsApache freqTest = new WordFrequencyStatsApache(); 16 | freqTest.getFreqStats(words); 17 | 18 | } 19 | public void getFreqStats(String[] words){ 20 | Frequency freq = new Frequency(); 21 | for( int i = 0; i < words.length; i++) { 22 | freq.addValue(words[i].trim()); 23 | } 24 | 25 | for( int i = 0; i < words.length; i++) { 26 | System.out.println(words[i] + "=" + freq.getCount(words[i])); 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /Chapter03/chap3/java/science/data/WordFrequencyStatsJava.java: -------------------------------------------------------------------------------- 1 | package chap3.java.science.data; 2 | 3 | import java.util.Map; 4 | import java.util.stream.Collectors; 5 | import java.util.stream.Stream; 6 | 7 | public class WordFrequencyStatsJava { 8 | public static void main(String[] args){ 9 | String str = "Horatio says 'tis but our fantasy, " 10 | + "And will not let belief take hold of him " 11 | + "Touching this dreaded sight, twice seen of us. 
" 12 | + "Therefore I have entreated him along, 35" 13 | + "With us to watch the minutes of this night, " 14 | + "That, if again this apparition come, " 15 | + "He may approve our eyes and speak to it."; 16 | 17 | WordFrequencyStatsJava freqTest = new WordFrequencyStatsJava(); 18 | freqTest.getFreqStats(str); 19 | } 20 | public void getFreqStats(String str){ 21 | Stream stream = Stream.of(str.toLowerCase().split("\\W+")).parallel(); 22 | Map wordFreq = stream 23 | .collect(Collectors.groupingBy(String::toString,Collectors.counting())); 24 | wordFreq.forEach((k,v)->System.out.println(k + "=" + v)); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /Chapter04/Code/B05916_04_01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Java-Data-Science-Cookbook/b2bf9ef367bf6c04a96e24123e4160b733b7fed9/Chapter04/Code/B05916_04_01.png -------------------------------------------------------------------------------- /Chapter04/Code/B05916_04_02.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Java-Data-Science-Cookbook/b2bf9ef367bf6c04a96e24123e4160b733b7fed9/Chapter04/Code/B05916_04_02.png -------------------------------------------------------------------------------- /Chapter04/Code/WekaArffTest.java: -------------------------------------------------------------------------------- 1 | package chap4.java.science.data; 2 | 3 | import java.io.BufferedWriter; 4 | import java.io.FileWriter; 5 | import java.util.ArrayList; 6 | 7 | import weka.core.Attribute; 8 | import weka.core.DenseInstance; 9 | import weka.core.Instances; 10 | 11 | public class WekaArffTest { 12 | public static void main(String[] args) throws Exception { 13 | ArrayList attributes; 14 | ArrayList classVals; 15 | Instances data; 16 | double[] values; 17 | 18 | // Set up attributes 19 | attributes 
= new ArrayList(); 20 | // Numeric attribute 21 | attributes.add(new Attribute("age")); 22 | // String attribute 23 | ArrayList empty = null; 24 | attributes.add(new Attribute("name", empty)); 25 | // Date attribute 26 | attributes.add(new Attribute("dob", "yyyy-MM-dd")); 27 | classVals = new ArrayList(); 28 | for (int i = 0; i < 5; i++){ 29 | classVals.add("class" + (i + 1)); 30 | } 31 | Attribute classVal = new Attribute("class", classVals); 32 | attributes.add(classVal); 33 | 34 | // Create Instances object 35 | data = new Instances("MyRelation", attributes, 0); 36 | 37 | // Data fill up 38 | // First instance 39 | values = new double[data.numAttributes()]; 40 | values[0] = 35; 41 | values[1] = data.attribute(1).addStringValue("John Doe"); 42 | values[2] = data.attribute(2).parseDate("1981-01-20"); 43 | values[3] = classVals.indexOf("class3"); 44 | 45 | // add 46 | data.add(new DenseInstance(1.0, values)); 47 | 48 | // Second instance 49 | values = new double[data.numAttributes()]; // important: needs NEW array! 
50 | values[0] = 30; 51 | values[1] = data.attribute(1).addStringValue("Harry Potter"); 52 | values[2] = data.attribute(2).parseDate("1986-07-05"); 53 | values[3] = classVals.indexOf("class1"); 54 | 55 | // add 56 | data.add(new DenseInstance(1.0, values)); 57 | 58 | //writing arff file to disk 59 | BufferedWriter writer = new BufferedWriter(new FileWriter("c:/training.arff")); 60 | writer.write(data.toString()); 61 | writer.close(); 62 | 63 | // Output data 64 | System.out.println(data); 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /Chapter04/Code/WekaAssociationRuleTest.java: -------------------------------------------------------------------------------- 1 | package chap4.java.science.data; 2 | 3 | import weka.associations.Apriori; 4 | import weka.core.Instances; 5 | import weka.core.converters.ConverterUtils.DataSource; 6 | 7 | public class WekaAssociationRuleTest { 8 | Instances superMarket = null; 9 | Apriori apriori; 10 | public void loadArff(String arffInput){ 11 | DataSource source = null; 12 | try { 13 | source = new DataSource(arffInput); 14 | superMarket = source.getDataSet(); 15 | } catch (Exception e1) { 16 | } 17 | } 18 | public void generateRule(){ 19 | apriori = new Apriori(); 20 | try { 21 | // apriori.setNumRules(20); 22 | apriori.buildAssociations(superMarket); 23 | System.out.println(apriori); 24 | } catch (Exception e) { 25 | } 26 | } 27 | public static void main(String args[]){ 28 | WekaAssociationRuleTest test = new WekaAssociationRuleTest(); 29 | test.loadArff("C:\\Program Files\\Weka-3-6\\data\\supermarket.arff"); 30 | test.generateRule(); 31 | } 32 | } -------------------------------------------------------------------------------- /Chapter04/Code/WekaCVTest.java: -------------------------------------------------------------------------------- 1 | package chap4.java.science.data; 2 | 3 | import java.util.Random; 4 | 5 | import weka.classifiers.Evaluation; 6 | import 
weka.classifiers.bayes.NaiveBayes; 7 | import weka.core.Instances; 8 | import weka.core.converters.ConverterUtils.DataSource; 9 | 10 | public class WekaCVTest { 11 | Instances iris = null; 12 | NaiveBayes nb; 13 | 14 | public void loadArff(String arffInput){ 15 | DataSource source = null; 16 | try { 17 | source = new DataSource(arffInput); 18 | iris = source.getDataSet(); 19 | if (iris.classIndex() == -1) 20 | iris.setClassIndex(iris.numAttributes() - 1); 21 | } catch (Exception e1) { 22 | } 23 | } 24 | 25 | public void generateModel(){ 26 | nb = new NaiveBayes(); 27 | try { 28 | nb.buildClassifier(iris); 29 | } catch (Exception e) { 30 | 31 | } 32 | } 33 | 34 | public void saveModel(String modelPath){ 35 | try { 36 | weka.core.SerializationHelper.write(modelPath, nb); 37 | } catch (Exception e) { 38 | } 39 | } 40 | 41 | public void crossValidate(){ 42 | Evaluation eval = null; 43 | try { 44 | eval = new Evaluation(iris); 45 | eval.crossValidateModel(nb, iris, 10, new Random(1)); 46 | System.out.println(eval.toSummaryString()); 47 | } catch (Exception e1) { 48 | } 49 | } 50 | 51 | public static void main(String[] args){ 52 | WekaCVTest test = new WekaCVTest(); 53 | test.loadArff("C:/Program Files/Weka-3-6/data/iris.arff"); 54 | test.generateModel(); 55 | test.saveModel("c:/nb.model"); 56 | test.crossValidate(); 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /Chapter04/Code/WekaClassesToClusterTest.java: -------------------------------------------------------------------------------- 1 | package chap4.java.science.data; 2 | 3 | import weka.clusterers.ClusterEvaluation; 4 | import weka.clusterers.EM; 5 | import weka.core.Instances; 6 | import weka.core.converters.ConverterUtils.DataSource; 7 | import weka.filters.Filter; 8 | import weka.filters.unsupervised.attribute.Remove; 9 | 10 | public class WekaClassesToClusterTest { 11 | Instances weather = null; 12 | EM clusterer; 13 | 14 | public void loadArff(String 
arffInput){ 15 | DataSource source = null; 16 | try { 17 | source = new DataSource(arffInput); 18 | weather = source.getDataSet(); 19 | weather.setClassIndex(weather.numAttributes() - 1); 20 | } catch (Exception e1) { 21 | } 22 | } 23 | 24 | public void generateClassToCluster(){ 25 | Remove filter = new Remove(); 26 | filter.setAttributeIndices("" + (weather.classIndex() + 1)); 27 | try { 28 | filter.setInputFormat(weather); 29 | Instances dataClusterer = Filter.useFilter(weather, filter); 30 | clusterer = new EM(); 31 | clusterer.buildClusterer(dataClusterer); 32 | ClusterEvaluation eval = new ClusterEvaluation(); 33 | eval.setClusterer(clusterer); 34 | eval.evaluateClusterer(weather); 35 | 36 | System.out.println(eval.clusterResultsToString()); 37 | } catch (Exception e) { 38 | } 39 | } 40 | 41 | public static void main(String[] args){ 42 | WekaClassesToClusterTest test = new WekaClassesToClusterTest(); 43 | test.loadArff("C:/Program Files/Weka-3-6/data/weather.nominal.arff"); 44 | test.generateClassToCluster(); 45 | } 46 | } -------------------------------------------------------------------------------- /Chapter04/Code/WekaClusterTest.java: -------------------------------------------------------------------------------- 1 | package chap4.java.science.data; 2 | 3 | import weka.clusterers.SimpleKMeans; 4 | import weka.core.Instances; 5 | import weka.core.converters.ConverterUtils.DataSource; 6 | 7 | public class WekaClusterTest { 8 | Instances cpu = null; 9 | SimpleKMeans kmeans; 10 | 11 | public void loadArff(String arffInput){ 12 | DataSource source = null; 13 | try { 14 | source = new DataSource(arffInput); 15 | cpu = source.getDataSet(); 16 | } catch (Exception e1) { 17 | } 18 | } 19 | 20 | public void clusterData(){ 21 | kmeans = new SimpleKMeans(); 22 | kmeans.setSeed(10); 23 | try { 24 | kmeans.setPreserveInstancesOrder(true); 25 | kmeans.setNumClusters(10); 26 | kmeans.buildClusterer(cpu); 27 | int[] assignments = kmeans.getAssignments(); 28 | int i = 0; 
29 | for(int clusterNum : assignments) { 30 | System.out.printf("Instance %d -> Cluster %d\n", i, clusterNum); 31 | i++; 32 | } 33 | } catch (Exception e1) { 34 | } 35 | } 36 | 37 | public static void main(String[] args) throws Exception{ 38 | WekaClusterTest test = new WekaClusterTest(); 39 | test.loadArff("C:\\Program Files\\Weka-3-6\\data\\cpu.arff"); 40 | test.clusterData(); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /Chapter04/Code/WekaFeatureSelectionTest.java: -------------------------------------------------------------------------------- 1 | package chap4.java.science.data; 2 | 3 | import java.util.Random; 4 | 5 | import weka.attributeSelection.AttributeSelection; 6 | import weka.attributeSelection.BestFirst; 7 | import weka.attributeSelection.CfsSubsetEval; 8 | import weka.classifiers.Evaluation; 9 | import weka.classifiers.bayes.NaiveBayes; 10 | import weka.classifiers.meta.AttributeSelectedClassifier; 11 | import weka.core.Instances; 12 | import weka.core.Utils; 13 | import weka.core.converters.ConverterUtils.DataSource; 14 | import weka.filters.Filter; 15 | 16 | public class WekaFeatureSelectionTest { 17 | Instances iris = null; 18 | NaiveBayes nb; 19 | public void loadArff(String arffInput){ 20 | DataSource source = null; 21 | try { 22 | source = new DataSource(arffInput); 23 | iris = source.getDataSet(); 24 | iris.setClassIndex(iris.numAttributes() - 1); 25 | } catch (Exception e1) { 26 | } 27 | } 28 | 29 | public void selectFeatures(){ 30 | AttributeSelection attSelection = new AttributeSelection(); 31 | CfsSubsetEval eval = new CfsSubsetEval(); 32 | BestFirst search = new BestFirst(); 33 | attSelection.setEvaluator(eval); 34 | attSelection.setSearch(search); 35 | try { 36 | attSelection.SelectAttributes(iris); 37 | int[] attIndex = attSelection.selectedAttributes(); 38 | System.out.println(Utils.arrayToString(attIndex)); 39 | } catch (Exception e) { 40 | } 41 | } 42 | 43 | public void 
selectFeaturesWithFilter(){ 44 | weka.filters.supervised.attribute.AttributeSelection filter = new weka.filters.supervised.attribute.AttributeSelection(); 45 | CfsSubsetEval eval = new CfsSubsetEval(); 46 | BestFirst search = new BestFirst(); 47 | filter.setEvaluator(eval); 48 | filter.setSearch(search); 49 | try { 50 | filter.setInputFormat(iris); 51 | Instances newData = Filter.useFilter(iris, filter); 52 | System.out.println(newData); 53 | } catch (Exception e) { 54 | } 55 | } 56 | 57 | public void selectFeaturesWithClassifiers(){ 58 | AttributeSelectedClassifier classifier = new AttributeSelectedClassifier(); 59 | CfsSubsetEval eval = new CfsSubsetEval(); 60 | BestFirst search = new BestFirst(); 61 | nb = new NaiveBayes(); 62 | classifier.setClassifier(nb); 63 | classifier.setEvaluator(eval); 64 | classifier.setSearch(search); 65 | Evaluation evaluation; 66 | try { 67 | evaluation = new Evaluation(iris); 68 | evaluation.crossValidateModel(classifier, iris, 10, new Random(1)); 69 | System.out.println(evaluation.toSummaryString()); 70 | } catch (Exception e) { 71 | } 72 | } 73 | 74 | public static void main(String[] args){ 75 | WekaFeatureSelectionTest test = new WekaFeatureSelectionTest(); 76 | test.loadArff("C:/Program Files/Weka-3-6/data/iris.arff"); 77 | test.selectFeatures(); 78 | test.selectFeaturesWithFilter(); 79 | test.selectFeaturesWithClassifiers(); 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /Chapter04/Code/WekaFilteredClassifierTest.java: -------------------------------------------------------------------------------- 1 | package chap4.java.science.data; 2 | 3 | import weka.classifiers.meta.FilteredClassifier; 4 | import weka.classifiers.trees.RandomForest; 5 | import weka.core.Instances; 6 | import weka.core.converters.ConverterUtils.DataSource; 7 | import weka.filters.unsupervised.attribute.Remove; 8 | 9 | 10 | public class WekaFilteredClassifierTest { 11 | Instances weather = null; 12 | 
RandomForest rf; 13 | 14 | public void loadArff(String arffInput){ 15 | DataSource source = null; 16 | try { 17 | source = new DataSource(arffInput); 18 | weather = source.getDataSet(); 19 | weather.setClassIndex(weather.numAttributes() - 1); 20 | } catch (Exception e1) { 21 | } 22 | } 23 | 24 | public void buildFilteredClassifier(){ 25 | rf = new RandomForest(); 26 | Remove rm = new Remove(); 27 | rm.setAttributeIndices("1"); 28 | FilteredClassifier fc = new FilteredClassifier(); 29 | fc.setFilter(rm); 30 | fc.setClassifier(rf); 31 | try{ 32 | fc.buildClassifier(weather); 33 | for (int i = 0; i < weather.numInstances(); i++){ 34 | double pred = fc.classifyInstance(weather.instance(i)); 35 | System.out.print("given value: " + weather.classAttribute().value((int) weather.instance(i).classValue())); 36 | System.out.println("---predicted value: " + weather.classAttribute().value((int) pred)); 37 | } 38 | } catch (Exception e) { 39 | } 40 | } 41 | 42 | public static void main(String[] args){ 43 | WekaFilteredClassifierTest test = new WekaFilteredClassifierTest(); 44 | test.loadArff("C:/Program Files/Weka-3-6/data/weather.nominal.arff"); 45 | test.buildFilteredClassifier(); 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /Chapter04/Code/WekaLinearRegressionTest.java: -------------------------------------------------------------------------------- 1 | package chap4.java.science.data; 2 | 3 | import weka.classifiers.functions.LinearRegression; 4 | import weka.core.Instances; 5 | import weka.core.converters.ConverterUtils.DataSource; 6 | 7 | public class WekaLinearRegressionTest { 8 | Instances cpu = null; 9 | LinearRegression lReg ; 10 | 11 | public void loadArff(String arffInput){ 12 | DataSource source = null; 13 | try { 14 | source = new DataSource(arffInput); 15 | cpu = source.getDataSet(); 16 | cpu.setClassIndex(cpu.numAttributes() - 1); 17 | } catch (Exception e1) { 18 | } 19 | } 20 | 21 | public void 
buildRegression(){ 22 | lReg = new LinearRegression(); 23 | try { 24 | lReg.buildClassifier(cpu); 25 | } catch (Exception e) { 26 | } 27 | System.out.println(lReg); 28 | } 29 | 30 | public static void main(String[] args) throws Exception{ 31 | WekaLinearRegressionTest test = new WekaLinearRegressionTest(); 32 | test.loadArff("C:\\Program Files\\Weka-3-6\\data\\cpu.arff"); 33 | test.buildRegression(); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /Chapter04/Code/WekaLogisticRegressionTest.java: -------------------------------------------------------------------------------- 1 | package chap4.java.science.data; 2 | 3 | import weka.classifiers.functions.Logistic; 4 | import weka.core.Instances; 5 | import weka.core.converters.ConverterUtils.DataSource; 6 | 7 | public class WekaLogisticRegressionTest { 8 | Instances iris = null; 9 | Logistic logReg ; 10 | 11 | public void loadArff(String arffInput){ 12 | DataSource source = null; 13 | try { 14 | source = new DataSource(arffInput); 15 | iris = source.getDataSet(); 16 | iris.setClassIndex(iris.numAttributes() - 1); 17 | } catch (Exception e1) { 18 | } 19 | } 20 | 21 | public void buildRegression(){ 22 | logReg = new Logistic(); 23 | 24 | try { 25 | logReg.buildClassifier(iris); 26 | } catch (Exception e) { 27 | } 28 | System.out.println(logReg); 29 | } 30 | 31 | public static void main(String[] args) throws Exception{ 32 | WekaLogisticRegressionTest test = new WekaLogisticRegressionTest(); 33 | test.loadArff("C:\\Program Files\\Weka-3-6\\data\\iris.arff"); 34 | test.buildRegression(); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /Chapter04/Code/WekaTrainTest.java: -------------------------------------------------------------------------------- 1 | package chap4.java.science.data; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.BufferedWriter; 5 | import java.io.FileReader; 6 | import java.io.FileWriter; 
7 | import java.io.IOException; 8 | 9 | import weka.classifiers.bayes.NaiveBayes; 10 | import weka.core.Instances; 11 | 12 | public class WekaTrainTest { 13 | NaiveBayes nb; 14 | Instances train, test, labeled; 15 | 16 | public void loadModel(String modelPath){ 17 | try { 18 | nb = (NaiveBayes) weka.core.SerializationHelper.read(modelPath); 19 | } catch (Exception e) { 20 | } 21 | } 22 | 23 | public void loadDatasets(String training, String testing){ 24 | BufferedReader reader = null; 25 | try { 26 | reader = new BufferedReader(new FileReader(training)); 27 | train = new Instances (reader); 28 | train.setClassIndex(train.numAttributes() -1); 29 | } catch (IOException e) { 30 | } 31 | 32 | 33 | try { 34 | reader = new BufferedReader(new FileReader(testing)); 35 | test = new Instances (reader); 36 | test.setClassIndex(train.numAttributes() -1); 37 | } catch (IOException e) { 38 | } 39 | 40 | try { 41 | reader.close(); 42 | } catch (IOException e) { 43 | } 44 | } 45 | 46 | public void classify(){ 47 | try { 48 | nb.buildClassifier(train); 49 | } catch (Exception e) { 50 | } 51 | 52 | labeled = new Instances(test); 53 | 54 | for (int i = 0; i < test.numInstances(); i++) { 55 | double clsLabel; 56 | try { 57 | clsLabel = nb.classifyInstance(test.instance(i)); 58 | labeled.instance(i).setClassValue(clsLabel); 59 | double[] predictionOutput = nb.distributionForInstance(test.instance(i)); 60 | double predictionProbability = predictionOutput[1]; 61 | System.out.println(predictionProbability); 62 | } catch (Exception e) { 63 | } 64 | } 65 | } 66 | 67 | public void writeArff(String outArff){ 68 | BufferedWriter writer; 69 | try { 70 | writer = new BufferedWriter(new FileWriter(outArff)); 71 | writer.write(labeled.toString()); 72 | writer.close(); 73 | } catch (IOException e) { 74 | } 75 | } 76 | 77 | public static void main(String[] args) throws Exception{ 78 | WekaTrainTest test = new WekaTrainTest(); 79 | test.loadModel("c:/nb.model"); 80 | test.loadDatasets("C:\\Program 
Files\\Weka-3-8\\data\\iris.arff", "c:\\iris-test.arff"); 81 | test.classify(); 82 | test.writeArff("c:/out.arff"); 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /Chapter05/chapter-5/JavaMachineLearning.java: -------------------------------------------------------------------------------- 1 | package chap5.java.science.data; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.util.Map; 6 | 7 | import net.sf.javaml.classification.Classifier; 8 | import net.sf.javaml.classification.KNearestNeighbors; 9 | import net.sf.javaml.classification.evaluation.CrossValidation; 10 | import net.sf.javaml.classification.evaluation.EvaluateDataset; 11 | import net.sf.javaml.classification.evaluation.PerformanceMeasure; 12 | import net.sf.javaml.clustering.Clusterer; 13 | import net.sf.javaml.clustering.KMeans; 14 | import net.sf.javaml.clustering.evaluation.ClusterEvaluation; 15 | import net.sf.javaml.clustering.evaluation.SumOfSquaredErrors; 16 | import net.sf.javaml.core.Dataset; 17 | import net.sf.javaml.distance.PearsonCorrelationCoefficient; 18 | import net.sf.javaml.featureselection.ranking.RecursiveFeatureEliminationSVM; 19 | import net.sf.javaml.featureselection.scoring.GainRatio; 20 | import net.sf.javaml.featureselection.subset.GreedyForwardSelection; 21 | import net.sf.javaml.tools.data.FileHandler; 22 | 23 | public class JavaMachineLearning { 24 | public static void main(String[] args) throws IOException{ 25 | Dataset data = FileHandler.loadDataset(new File("datasets/UCI-small/iris/iris.data"), 4, ","); 26 | System.out.println(data); 27 | FileHandler.exportDataset(data, new File("c:/javaml-output.txt")); 28 | data = FileHandler.loadDataset(new File("c:/javaml-output.txt"), 0,"\t"); 29 | System.out.println(data); 30 | 31 | //Clustering 32 | Clusterer km = new KMeans(); 33 | Dataset[] clusters = km.cluster(data); 34 | for(Dataset cluster:clusters){ 35 | System.out.println("Cluster: " + 
cluster); 36 | } 37 | ClusterEvaluation sse= new SumOfSquaredErrors(); 38 | double score = sse.score(clusters); 39 | System.out.println(score); 40 | 41 | //Classification 42 | Classifier knn = new KNearestNeighbors(5); 43 | knn.buildClassifier(data); 44 | //Cross validation 45 | CrossValidation cv = new CrossValidation(knn); 46 | Map cvEvaluation = cv.crossValidation(data); 47 | System.out.println(cvEvaluation + "---------"); 48 | //Held-out testing 49 | Dataset testData = FileHandler.loadDataset(new File("datasets/UCI-small/iris/iris.data"), 4, ","); 50 | Map testEvaluation = 51 | EvaluateDataset.testDataset(knn, testData); 52 | for(Object classVariable:testEvaluation.keySet()){ 53 | System.out.println(classVariable + " class has "+testEvaluation.get(classVariable).getAccuracy()); 54 | } 55 | 56 | //Feature scoring 57 | GainRatio gainRatio = new GainRatio(); 58 | gainRatio.build(data); 59 | for (int i = 0; i < gainRatio.noAttributes(); i++){ 60 | System.out.println(gainRatio.score(i)); 61 | } 62 | 63 | //Feature ranking 64 | RecursiveFeatureEliminationSVM featureRank = new RecursiveFeatureEliminationSVM(0.2); 65 | featureRank.build(data); 66 | for (int i = 0; i < featureRank.noAttributes(); i++){ 67 | System.out.println(featureRank.rank(i)); 68 | } 69 | 70 | //Feature subset selection 71 | GreedyForwardSelection featureSelection = new GreedyForwardSelection(5, new PearsonCorrelationCoefficient()); 72 | featureSelection.build(data); 73 | System.out.println(featureSelection.selectedAttributes()); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /Chapter05/chapter-5/MOA.java: -------------------------------------------------------------------------------- 1 | package chap5.java.science.data; 2 | 3 | import moa.classifiers.trees.HoeffdingTree; 4 | import moa.classifiers.Classifier; 5 | import moa.core.TimingUtils; 6 | import moa.streams.generators.RandomRBFGenerator; 7 | import com.yahoo.labs.samoa.instances.Instance; 
8 | import java.io.IOException; 9 | 10 | 11 | public class MOA { 12 | 13 | public void run(int numInstances, boolean isTesting){ 14 | Classifier learner = new HoeffdingTree(); 15 | RandomRBFGenerator stream = new RandomRBFGenerator(); 16 | stream.prepareForUse(); 17 | 18 | learner.setModelContext(stream.getHeader()); 19 | learner.prepareForUse(); 20 | 21 | int numberSamplesCorrect = 0; 22 | int numberSamples = 0; 23 | long evaluateStartTime = TimingUtils.getNanoCPUTimeOfCurrentThread(); 24 | while (stream.hasMoreInstances() && numberSamples < numInstances) { 25 | Instance trainInst = stream.nextInstance().getData(); 26 | if (isTesting) { 27 | if (learner.correctlyClassifies(trainInst)){ 28 | numberSamplesCorrect++; 29 | } 30 | } 31 | numberSamples++; 32 | learner.trainOnInstance(trainInst); 33 | } 34 | double accuracy = 100.0 * (double) numberSamplesCorrect/ (double) numberSamples; 35 | double time = TimingUtils.nanoTimeToSeconds(TimingUtils.getNanoCPUTimeOfCurrentThread()- evaluateStartTime); 36 | System.out.println(numberSamples + " instances processed with " + accuracy + "% accuracy in "+time+" seconds."); 37 | } 38 | 39 | public static void main(String[] args) throws IOException { 40 | MOA exp = new MOA(); 41 | exp.run(1000000, true); 42 | } 43 | } -------------------------------------------------------------------------------- /Chapter05/chapter-5/Mulan.java: -------------------------------------------------------------------------------- 1 | import mulan.classifier.lazy.MLkNN; 2 | import mulan.classifier.meta.RAkEL; 3 | import mulan.classifier.transformation.LabelPowerset; 4 | import mulan.data.InvalidDataFormatException; 5 | import mulan.data.MultiLabelInstances; 6 | import mulan.evaluation.Evaluator; 7 | import mulan.evaluation.MultipleEvaluation; 8 | import weka.classifiers.trees.J48; 9 | 10 | public class Mulan { 11 | public static void main(String[] args){ 12 | MultiLabelInstances dataset = null; 13 | try { 14 | dataset = new 
MultiLabelInstances("F:\\mulan-1.5.0\\mulan\\data\\emotions.arff", "F:\\mulan-1.5.0\\mulan\\data\\emotions.xml"); 15 | } catch (InvalidDataFormatException e) { 16 | } 17 | RAkEL learner1 = new RAkEL(new LabelPowerset(new J48())); 18 | MLkNN learner2 = new MLkNN(); 19 | Evaluator eval = new Evaluator(); 20 | MultipleEvaluation results; 21 | int numFolds = 10; 22 | results = eval.crossValidate(learner1, dataset, numFolds); 23 | System.out.println(results); 24 | results = eval.crossValidate(learner2, dataset, numFolds); 25 | System.out.println(results); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /Chapter05/chapter-5/StanfordClassifier.java: -------------------------------------------------------------------------------- 1 | package chap5.java.science.data; 2 | 3 | import edu.stanford.nlp.classify.Classifier; 4 | import edu.stanford.nlp.classify.ColumnDataClassifier; 5 | import edu.stanford.nlp.ling.Datum; 6 | import edu.stanford.nlp.objectbank.ObjectBank; 7 | 8 | public class StanfordClassifier { 9 | public static void main(String[] args) throws Exception { 10 | ColumnDataClassifier columnDataClassifier = new ColumnDataClassifier("examples/cheese2007.prop"); 11 | Classifier classifier = 12 | columnDataClassifier.makeClassifier(columnDataClassifier.readTrainingExamples("examples/cheeseDisease.train")); 13 | for (String line : ObjectBank.getLineIterator("examples/cheeseDisease.test", "utf-8")) { 14 | Datum d = columnDataClassifier.makeDatumFromLine(line); 15 | System.out.println(line + " ==> " + classifier.classOf(d)); 16 | } 17 | } 18 | } 19 | 20 | -------------------------------------------------------------------------------- /Chapter06/CosineSimilarity.java: -------------------------------------------------------------------------------- 1 | package chap6.java.science.data; 2 | 3 | import java.util.HashSet; 4 | import java.util.Map; 5 | import java.util.Set; 6 | import java.util.stream.Collectors; 7 | import 
java.util.stream.Stream; 8 | 9 | public class CosineSimilarity { 10 | public double calculateCosine(String s1, String s2){ 11 | //tokenization in parallel with Java 8 12 | Stream stream1 = Stream.of(s1.toLowerCase().split("\\W+")).parallel(); 13 | Stream stream2 = Stream.of(s2.toLowerCase().split("\\W+")).parallel(); 14 | 15 | //word frequency maps for two strings 16 | Map wordFreq1 = stream1 17 | .collect(Collectors.groupingBy(String::toString,Collectors.counting())); 18 | Map wordFreq2 = stream2 19 | .collect(Collectors.groupingBy(String::toString,Collectors.counting())); 20 | 21 | //unique words for each string 22 | Set wordSet1 = wordFreq1.keySet(); 23 | Set wordSet2 = wordFreq2.keySet(); 24 | 25 | //common words of two strings 26 | Set intersection = new HashSet(wordSet1); 27 | intersection.retainAll(wordSet2); 28 | 29 | //numerator of cosine formula. s1.s2 30 | double numerator = 0; 31 | for (String common: intersection){ 32 | numerator += wordFreq1.get(common) * wordFreq2.get(common); 33 | } 34 | 35 | //denominator of cosine formula has two parameters 36 | double param1 = 0, param2 = 0; 37 | 38 | //sqrt (sum of squared of s1 word frequencies) 39 | for(String w1: wordSet1){ 40 | param1 += Math.pow(wordFreq1.get(w1), 2); 41 | } 42 | param1 = Math.sqrt(param1); 43 | 44 | //sqrt (sum of squared of s2 word frequencies) 45 | for(String w2: wordSet2){ 46 | param2 += Math.pow(wordFreq2.get(w2), 2); 47 | } 48 | param2 = Math.sqrt(param2); 49 | 50 | //denominator of cosine formula. 
sqrt(sum(s1^2)) X sqrt(sum(s2^2)) 51 | double denominator = param1 * param2; 52 | 53 | //cosine measure 54 | double cosineSimilarity = numerator/denominator; 55 | return cosineSimilarity; 56 | }//end method to calculate cosine similarity of two strings 57 | 58 | public static void main(String[] args){ 59 | CosineSimilarity cos = new CosineSimilarity(); 60 | System.out.println(cos.calculateCosine("To be, or not to be: that is the question.", "Frailty, thy name is woman!")); 61 | System.out.println(cos.calculateCosine("The lady doth protest too much, methinks.", "Frailty, thy name is woman!")); 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /Chapter06/Lemmatizer.java: -------------------------------------------------------------------------------- 1 | package chap6.java.science.data; 2 | 3 | import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation; 4 | import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation; 5 | import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation; 6 | import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation; 7 | import edu.stanford.nlp.ling.CoreLabel; 8 | import edu.stanford.nlp.pipeline.Annotation; 9 | import edu.stanford.nlp.pipeline.StanfordCoreNLP; 10 | import edu.stanford.nlp.util.CoreMap; 11 | import java.util.ArrayList; 12 | import java.util.List; 13 | import java.util.Properties; 14 | /** 15 | * Class to perform lemmatization using Stanford Core NLP 16 | * @author Themistoklis Mavridis 17 | */ 18 | public class Lemmatizer { 19 | 20 | 21 | public static void main(String[] args) 22 | { 23 | StanfordCoreNLP pipeline; 24 | Properties props = new Properties(); 25 | props.put("annotators", "tokenize, ssplit, pos, lemma"); 26 | pipeline = new StanfordCoreNLP(props, false); 27 | String text = "This is a test string"; 28 | Annotation document = pipeline.process(text); 29 | 30 | for(CoreMap sentence: document.get(SentencesAnnotation.class)) 31 | { 32 | for(CoreLabel token: 
sentence.get(TokensAnnotation.class)) 33 | { 34 | String word = token.get(TextAnnotation.class); 35 | String lemma = token.get(LemmaAnnotation.class); 36 | System.out.println("lemmatized version :" + lemma); 37 | } 38 | } 39 | } 40 | } -------------------------------------------------------------------------------- /Chapter06/OpenNlpSenToken.java: -------------------------------------------------------------------------------- 1 | package chap6.java.science.data; 2 | 3 | import java.io.FileInputStream; 4 | import java.io.IOException; 5 | import java.io.InputStream; 6 | 7 | import opennlp.tools.sentdetect.SentenceDetectorME; 8 | import opennlp.tools.sentdetect.SentenceModel; 9 | import opennlp.tools.tokenize.Tokenizer; 10 | import opennlp.tools.tokenize.TokenizerME; 11 | import opennlp.tools.tokenize.TokenizerModel; 12 | 13 | public class OpenNlpSenToken { 14 | public static void main(String[] args){ 15 | OpenNlpSenToken openNlp = new OpenNlpSenToken(); 16 | try { 17 | openNlp.useOpenNlp("My name is Rushdi Shams. " 18 | + "You can use Dr. before my name as I have a Ph.D. 
" 19 | + "but I am a bit shy to use it.", "opennlp-models/en-sent.bin", "sentence"); 20 | openNlp.useOpenNlp("\"Let's get this vis-a-vis\", he said, \"these boys' marks are really that well?\"", "opennlp-models/en-token.bin", "word"); 21 | } catch (IOException e) { 22 | } 23 | } 24 | public void useOpenNlp(String sourceText, String modelPath, String choice) throws IOException{ 25 | InputStream modelIn = null; 26 | modelIn = new FileInputStream(modelPath); 27 | 28 | if(choice.equalsIgnoreCase("sentence")){ 29 | SentenceModel model = new SentenceModel(modelIn); 30 | modelIn.close(); 31 | SentenceDetectorME sentenceDetector = new SentenceDetectorME(model); 32 | String sentences[] = sentenceDetector.sentDetect(sourceText); 33 | System.out.println("Sentences: "); 34 | for(String sentence:sentences){ 35 | System.out.println(sentence); 36 | } 37 | } 38 | else if(choice.equalsIgnoreCase("word")){ 39 | TokenizerModel model = new TokenizerModel(modelIn); 40 | modelIn.close(); 41 | Tokenizer tokenizer = new TokenizerME(model); 42 | String tokens[] = tokenizer.tokenize(sourceText); 43 | System.out.println("Words: "); 44 | for(String token:tokens){ 45 | System.out.println(token); 46 | } 47 | } 48 | else{ 49 | System.out.println("Error in choice"); 50 | modelIn.close(); 51 | return; 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /Chapter06/SentenceDetection.java: -------------------------------------------------------------------------------- 1 | package chap6.java.science.data; 2 | 3 | import java.text.BreakIterator; 4 | import java.util.Locale; 5 | 6 | public class SentenceDetection { 7 | public void useSentenceIterator(String source){ 8 | BreakIterator iterator = BreakIterator.getSentenceInstance(Locale.US); 9 | iterator.setText(source); 10 | int start = iterator.first(); 11 | for (int end = iterator.next(); 12 | end != BreakIterator.DONE; 13 | start = end, end = iterator.next()) { 14 | 
System.out.println(source.substring(start,end)); 15 | } 16 | } 17 | public static void main(String[] args){ 18 | SentenceDetection detection = new SentenceDetection(); 19 | String test = "My name is Rushdi Shams. You can use Dr. before my name as I have a Ph.D. but I am a bit shy to use it."; 20 | detection.useSentenceIterator(test); 21 | } 22 | } 23 | 24 | -------------------------------------------------------------------------------- /Chapter06/WekaClassification.java: -------------------------------------------------------------------------------- 1 | package chap6.java.science.data; 2 | 3 | import weka.core.*; 4 | import weka.core.converters.*; 5 | import weka.classifiers.Evaluation; 6 | import weka.classifiers.bayes.NaiveBayes; 7 | import weka.filters.*; 8 | import weka.filters.unsupervised.attribute.*; 9 | 10 | import java.io.*; 11 | import java.util.Random; 12 | 13 | public class WekaClassification { 14 | public static void main(String[] args) throws Exception { 15 | TextDirectoryLoader loader = new TextDirectoryLoader(); 16 | loader.setDirectory(new File("C:/text_example")); 17 | Instances data = loader.getDataSet(); 18 | 19 | StringToWordVector filter = new StringToWordVector(); 20 | filter.setInputFormat(data); 21 | Instances dataFiltered = Filter.useFilter(data, filter); 22 | 23 | NaiveBayes nb = new NaiveBayes(); 24 | nb.buildClassifier(dataFiltered); 25 | System.out.println("\n\nClassifier model:\n\n" + nb); 26 | 27 | Evaluation eval = null; 28 | eval = new Evaluation(dataFiltered); 29 | eval.crossValidateModel(nb, dataFiltered, 5, new Random(1)); 30 | System.out.println(eval.toSummaryString()); 31 | } 32 | } -------------------------------------------------------------------------------- /Chapter06/WordDetection.java: -------------------------------------------------------------------------------- 1 | package chap6.java.science.data; 2 | 3 | import java.text.BreakIterator; 4 | import java.util.StringTokenizer; 5 | import java.util.regex.Matcher; 6 | 
import java.util.regex.Pattern; 7 | 8 | public class WordDetection { 9 | public static void main(String[] args){ 10 | String input = "\"Let's get this vis-a-vis\", he said, \"these boys' marks are really that well?\""; 11 | WordDetection wordDetection = new WordDetection(); 12 | wordDetection.useTokenizer(input); 13 | wordDetection.useBreakIterator(input); 14 | wordDetection.useRegEx(input); 15 | 16 | } 17 | 18 | public void useTokenizer(String input){ 19 | System.out.println("Tokenizer"); 20 | StringTokenizer tokenizer = new StringTokenizer(input); 21 | String word =""; 22 | while(tokenizer.hasMoreTokens()){ 23 | word = tokenizer.nextToken(); 24 | System.out.println(word); 25 | } 26 | } 27 | 28 | public void useBreakIterator(String input){ 29 | System.out.println("Break Iterator"); 30 | BreakIterator tokenizer = BreakIterator.getWordInstance(); 31 | tokenizer.setText(input); 32 | int start = tokenizer.first(); 33 | for (int end = tokenizer.next(); 34 | end != BreakIterator.DONE; 35 | start = end, end = tokenizer.next()) { 36 | System.out.println(input.substring(start,end)); 37 | } 38 | } 39 | 40 | public void useRegEx(String input){ 41 | System.out.println("Regular Expression"); 42 | Pattern pattern = Pattern.compile("\\w[\\w-]+('\\w*)?"); 43 | Matcher matcher = pattern.matcher(input); 44 | 45 | while ( matcher.find() ) { 46 | System.out.println(input.substring(matcher.start(), matcher.end())); 47 | } 48 | } 49 | } 50 | 51 | -------------------------------------------------------------------------------- /Chapter07/Code/KMeansClusteringMlib.java: -------------------------------------------------------------------------------- 1 | package com.data.big.mlib; 2 | 3 | import org.apache.spark.api.java.*; 4 | import org.apache.spark.api.java.function.Function; 5 | import org.apache.spark.mllib.clustering.KMeans; 6 | import org.apache.spark.mllib.clustering.KMeansModel; 7 | import org.apache.spark.mllib.linalg.Vector; 8 | import org.apache.spark.mllib.linalg.Vectors; 9 | 
import org.apache.spark.SparkConf; 10 | 11 | public class KMeansClusteringMlib { 12 | public static void main( String[] args ){ 13 | SparkConf conf = new SparkConf().setMaster("local[4]").setAppName("K-means Example"); 14 | JavaSparkContext sc = new JavaSparkContext(conf); 15 | 16 | // Load and parse data 17 | String path = "data/km-data.txt"; 18 | JavaRDD data = sc.textFile(path); 19 | JavaRDD parsedData = data.map( 20 | new Function() { 21 | public Vector call(String s) { 22 | String[] sarray = s.split(" "); 23 | double[] values = new double[sarray.length]; 24 | for (int i = 0; i < sarray.length; i++) 25 | values[i] = Double.parseDouble(sarray[i]); 26 | return Vectors.dense(values); 27 | } 28 | } 29 | ); 30 | parsedData.cache(); 31 | 32 | // Cluster the data into two classes using KMeans 33 | int numClusters = 2; 34 | int numIterations = 20; 35 | KMeansModel clusters = KMeans.train(parsedData.rdd(), numClusters, numIterations); 36 | 37 | // Evaluate clustering by computing Within Set Sum of Squared Errors 38 | double WSSSE = clusters.computeCost(parsedData.rdd()); 39 | System.out.println("Within Set Sum of Squared Errors = " + WSSSE); 40 | 41 | 42 | 43 | } 44 | } -------------------------------------------------------------------------------- /Chapter07/Code/LinearRegressionMlib.java: -------------------------------------------------------------------------------- 1 | package com.data.big.mlib; 2 | 3 | import scala.Tuple2; 4 | 5 | import org.apache.spark.api.java.*; 6 | import org.apache.spark.api.java.function.Function; 7 | import org.apache.spark.mllib.linalg.Vectors; 8 | import org.apache.spark.mllib.regression.LabeledPoint; 9 | import org.apache.spark.mllib.regression.LinearRegressionModel; 10 | import org.apache.spark.mllib.regression.LinearRegressionWithSGD; 11 | import org.apache.spark.SparkConf; 12 | 13 | public class LinearRegressionMlib { 14 | 15 | public static void main(String[] args) { 16 | SparkConf configuration = new 
SparkConf().setMaster("local[4]").setAppName("Linear Regression Example"); 17 | JavaSparkContext sparkContext = new JavaSparkContext(configuration); 18 | 19 | // Load and parse the data 20 | String inputData = "data/lr-data.txt"; 21 | JavaRDD data = sparkContext.textFile(inputData); 22 | JavaRDD parsedData = data.map( 23 | new Function() { 24 | public LabeledPoint call(String line) { 25 | String[] parts = line.split(","); 26 | String[] features = parts[1].split(" "); 27 | double[] featureVector = new double[features.length]; 28 | for (int i = 0; i < features.length - 1; i++){ 29 | featureVector[i] = Double.parseDouble(features[i]); 30 | } 31 | return new LabeledPoint(Double.parseDouble(parts[0]), Vectors.dense(featureVector)); 32 | } 33 | } 34 | ); 35 | parsedData.cache(); 36 | 37 | // Building the model 38 | int numIterations = 100; 39 | final LinearRegressionModel model = 40 | LinearRegressionWithSGD.train(JavaRDD.toRDD(parsedData), numIterations); 41 | 42 | // Evaluate model on training examples and compute training error 43 | JavaRDD> valuesAndPreds = parsedData.map( 44 | new Function>() { 45 | public Tuple2 call(LabeledPoint point) { 46 | double prediction = model.predict(point.features()); 47 | return new Tuple2(prediction, point.label()); 48 | } 49 | } 50 | ); 51 | double MSE = new JavaDoubleRDD(valuesAndPreds.map( 52 | new Function, Object>() { 53 | public Object call(Tuple2 pair) { 54 | return Math.pow(pair._1() - pair._2(), 2.0); 55 | } 56 | } 57 | ).rdd()).mean(); 58 | System.out.println("training Mean Squared Error = " + MSE); 59 | } 60 | } -------------------------------------------------------------------------------- /Chapter07/Code/OnlineLogisticRegressionTest.java: -------------------------------------------------------------------------------- 1 | package chapter4.src.logistic; 2 | 3 | import com.google.common.base.Charsets; 4 | import com.google.common.io.Resources; 5 | 6 | import org.apache.mahout.math.Matrix; 7 | import 
org.apache.mahout.math.SequentialAccessSparseVector; 8 | import org.apache.mahout.math.Vector; 9 | import org.apache.mahout.classifier.evaluation.Auc; 10 | import org.apache.mahout.classifier.sgd.CsvRecordFactory; 11 | import org.apache.mahout.classifier.sgd.LogisticModelParameters; 12 | import org.apache.mahout.classifier.sgd.OnlineLogisticRegression; 13 | 14 | import java.io.BufferedReader; 15 | import java.io.File; 16 | import java.io.FileInputStream; 17 | import java.io.IOException; 18 | import java.io.InputStream; 19 | import java.io.InputStreamReader; 20 | import java.io.OutputStreamWriter; 21 | import java.io.PrintWriter; 22 | import java.util.Locale; 23 | 24 | public class OnlineLogisticRegressionTest { 25 | 26 | private static String inputFile="data/weather.numeric.test.csv"; 27 | private static String modelFile="model/model"; 28 | private static boolean showAuc; 29 | private static boolean showScores; 30 | private static boolean showConfusion; 31 | static BufferedReader open(String inputFile) throws IOException { 32 | InputStream in; 33 | try { 34 | in = Resources.getResource(inputFile).openStream(); 35 | } catch (IllegalArgumentException e) { 36 | in = new FileInputStream(new File(inputFile)); 37 | } 38 | return new BufferedReader(new InputStreamReader(in, Charsets.UTF_8)); 39 | } 40 | 41 | public static void main(String[] args) throws Exception { 42 | showAuc = true; 43 | showConfusion = true; 44 | Auc collector = new Auc(); 45 | LogisticModelParameters lmp = LogisticModelParameters.loadFrom(new File(modelFile)); 46 | CsvRecordFactory csv = lmp.getCsvRecordFactory(); 47 | OnlineLogisticRegression lr = lmp.createRegression(); 48 | BufferedReader in = OnlineLogisticRegressionTest.open(inputFile); 49 | String line = in.readLine(); 50 | csv.firstLine(line); 51 | line = in.readLine(); 52 | PrintWriter output=new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true); 53 | output.println("\"target\",\"model-output\",\"log-likelihood\""); 54 | 
while (line != null) { 55 | System.out.println("-----" + line); 56 | Vector v = new SequentialAccessSparseVector(lmp.getNumFeatures()); 57 | int target = csv.processLine(line, v); 58 | double score = lr.classifyScalarNoLink(v); 59 | output.printf(Locale.ENGLISH, "%d,%.3f,%.6f%n", target, score, lr.logLikelihood(target, v)); 60 | collector.add(target, score); 61 | line = in.readLine(); 62 | System.out.println("I am here"); 63 | } 64 | output.printf(Locale.ENGLISH, "AUC = %.2f%n", collector.auc()); 65 | Matrix m = collector.confusion(); 66 | output.printf(Locale.ENGLISH, "confusion: [[%.1f, %.1f], [%.1f, %.1f]]%n", 67 | m.get(0, 0), m.get(1, 0), m.get(0, 1), m.get(1, 1)); 68 | m = collector.entropy(); 69 | output.printf(Locale.ENGLISH, "entropy: [[%.1f, %.1f], [%.1f, %.1f]]%n", 70 | m.get(0, 0), m.get(1, 0), m.get(0, 1), m.get(1, 1)); 71 | } 72 | 73 | } -------------------------------------------------------------------------------- /Chapter07/Code/OnlineLogisticRegressionTrain.java: -------------------------------------------------------------------------------- 1 | package chapter4.src.logistic; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.FileOutputStream; 5 | import java.io.FileReader; 6 | import java.io.IOException; 7 | import java.io.OutputStream; 8 | import java.io.OutputStreamWriter; 9 | import java.io.PrintWriter; 10 | import java.util.Arrays; 11 | import java.util.List; 12 | import java.util.Locale; 13 | 14 | import org.apache.mahout.classifier.sgd.CsvRecordFactory; 15 | import org.apache.mahout.classifier.sgd.LogisticModelParameters; 16 | import org.apache.mahout.classifier.sgd.OnlineLogisticRegression; 17 | import org.apache.mahout.classifier.sgd.RecordFactory; 18 | import org.apache.mahout.math.RandomAccessSparseVector; 19 | import org.apache.mahout.math.Vector; 20 | 21 | import com.google.common.base.Charsets; 22 | 23 | 24 | public class OnlineLogisticRegressionTrain { 25 | private static double predictorWeight(OnlineLogisticRegression lr, 
int row, RecordFactory csv, String predictor) { 26 | double weight = 0; 27 | for (Integer column : csv.getTraceDictionary().get(predictor)) { 28 | weight += lr.getBeta().get(row, column); 29 | } 30 | return weight; 31 | } 32 | public static void main(String[] args) throws IOException 33 | { 34 | String inputFile = "data/weather.numeric.csv"; 35 | String outputFile = "model/model"; 36 | 37 | /* List predictorList =Arrays.asList("age","job","marital","education","default", 38 | "housing","loan","contact","month","day_of_week","duration","campaign","pdays","previous","poutcome", 39 | "emp.var.rate","cons.price.idx","cons.conf.idx","euribor3m","nr.employed"); 40 | List typeList = Arrays.asList("n", "w", "w", "w", "w", "w", "w", "w", "w", "w", "n", "n", "n", "n", 41 | "w", "n", "n", "n", "n", "n");*/ 42 | 43 | /*List predictorList =Arrays.asList("sepallength", "sepalwidth", "petallength", "petalwidth", "class"); 44 | List typeList = Arrays.asList("n", "n", "n", "n", "w");*/ 45 | List predictorList =Arrays.asList("outlook", "temperature", "humidity", "windy", "play"); 46 | List typeList = Arrays.asList("w", "n", "n", "w", "w"); 47 | LogisticModelParameters lmp = new LogisticModelParameters(); 48 | lmp.setTargetVariable("play"); 49 | lmp.setMaxTargetCategories(2); 50 | lmp.setNumFeatures(4); 51 | lmp.setUseBias(false); 52 | lmp.setTypeMap(predictorList,typeList); 53 | lmp.setLearningRate(0.5); 54 | 55 | 56 | int passes = 50; 57 | OnlineLogisticRegression lr; 58 | 59 | CsvRecordFactory csv = lmp.getCsvRecordFactory(); 60 | lr = lmp.createRegression(); 61 | 62 | 63 | int k = 0; 64 | 65 | for (int pass = 0; pass < passes; pass++) { 66 | BufferedReader in = new BufferedReader(new FileReader(inputFile)); 67 | 68 | csv.firstLine(in.readLine()); 69 | 70 | String line = in.readLine(); 71 | System.out.println(line); 72 | int lineCount = 2; 73 | while (line != null) { 74 | System.out.println("line " + lineCount); 75 | System.out.println(lmp.getNumFeatures()); 76 | Vector input = 
new RandomAccessSparseVector(lmp.getNumFeatures()); 77 | int targetValue = csv.processLine(line, input); 78 | 79 | // update model 80 | lr.train(targetValue, input); 81 | k++; 82 | 83 | line = in.readLine(); 84 | lineCount++; 85 | } 86 | in.close(); 87 | } 88 | 89 | OutputStream modelOutput = new FileOutputStream(outputFile); 90 | try { 91 | lmp.saveTo(modelOutput); 92 | } finally { 93 | modelOutput.close(); 94 | } 95 | PrintWriter output=new PrintWriter(new OutputStreamWriter(System.out, Charsets.UTF_8), true); 96 | output.println(lmp.getNumFeatures()); 97 | output.println(lmp.getTargetVariable() + " ~ "); 98 | String sep = ""; 99 | for (String v : csv.getTraceDictionary().keySet()) { 100 | double weight = predictorWeight(lr, 0, csv, v); 101 | if (weight != 0) { 102 | output.printf(Locale.ENGLISH, "%s%.3f*%s", sep, weight, v); 103 | sep = " + "; 104 | } 105 | } 106 | output.printf("%n"); 107 | for (int row = 0; row < lr.getBeta().numRows(); row++) { 108 | for (String key : csv.getTraceDictionary().keySet()) { 109 | double weight = predictorWeight(lr, row, csv, key); 110 | if (weight != 0) { 111 | output.printf(Locale.ENGLISH, "%20s %.5f%n", key, weight); 112 | } 113 | } 114 | for (int column = 0; column < lr.getBeta().numCols(); column++) { 115 | output.printf(Locale.ENGLISH, "%15.9f ", lr.getBeta().get(row, column)); 116 | } 117 | output.println(); 118 | } 119 | } 120 | } -------------------------------------------------------------------------------- /Chapter07/Code/RandomForestMlib.java: -------------------------------------------------------------------------------- 1 | package com.data.big.mlib; 2 | 3 | import scala.Tuple2; 4 | import java.util.HashMap; 5 | import org.apache.spark.SparkConf; 6 | import org.apache.spark.api.java.JavaPairRDD; 7 | import org.apache.spark.api.java.JavaRDD; 8 | import org.apache.spark.api.java.JavaSparkContext; 9 | import org.apache.spark.api.java.function.Function; 10 | import org.apache.spark.api.java.function.PairFunction; 11 | 
import org.apache.spark.mllib.regression.LabeledPoint; 12 | import org.apache.spark.mllib.tree.RandomForest; 13 | import org.apache.spark.mllib.tree.model.RandomForestModel; 14 | import org.apache.spark.mllib.util.MLUtils; 15 | 16 | public class RandomForestMlib { 17 | public static void main(String args[]){ 18 | 19 | SparkConf configuration = new SparkConf().setMaster("local[4]").setAppName("Any"); 20 | JavaSparkContext sc = new JavaSparkContext(configuration); 21 | 22 | // Load and parse the data file. 23 | String input = "data/rf-data.txt"; 24 | JavaRDD data = MLUtils.loadLibSVMFile(sc.sc(), input).toJavaRDD(); 25 | // Split the data into training and test sets (30% held out for testing) 26 | JavaRDD[] dataSplits = data.randomSplit(new double[]{0.7, 0.3}); 27 | JavaRDD trainingData = dataSplits[0]; 28 | JavaRDD testData = dataSplits[1]; 29 | 30 | // Train a RandomForest model. 31 | Integer numClasses = 2; 32 | HashMap categoricalFeaturesInfo = new HashMap();// Empty categoricalFeaturesInfo indicates all features are continuous. 33 | Integer numTrees = 3; // Use more in practice. 34 | String featureSubsetStrategy = "auto"; // Let the algorithm choose. 
35 | String impurity = "gini"; 36 | Integer maxDepth = 5; 37 | Integer maxBins = 32; 38 | Integer seed = 12345; 39 | 40 | final RandomForestModel rfModel = RandomForest.trainClassifier(trainingData, numClasses, 41 | categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins, 42 | seed); 43 | 44 | // Evaluate model on test instances and compute test error 45 | JavaPairRDD label = 46 | testData.mapToPair(new PairFunction() { 47 | public Tuple2 call(LabeledPoint p) { 48 | return new Tuple2(rfModel.predict(p.features()), p.label()); 49 | } 50 | }); 51 | 52 | Double testError = 53 | 1.0 * label.filter(new Function, Boolean>() { 54 | public Boolean call(Tuple2 pl) { 55 | return !pl._1().equals(pl._2()); 56 | } 57 | }).count() / testData.count(); 58 | 59 | System.out.println("Test Error: " + testError); 60 | System.out.println("Learned classification forest model:\n" + rfModel.toDebugString()); 61 | } 62 | } -------------------------------------------------------------------------------- /Chapter07/Code/ScalaTest.java: -------------------------------------------------------------------------------- 1 | package com.data.big.mlib; 2 | 3 | import org.apache.spark.SparkConf; 4 | import org.apache.spark.api.java.JavaRDD; 5 | import org.apache.spark.api.java.JavaSparkContext; 6 | import org.apache.spark.api.java.function.Function; 7 | 8 | 9 | public class ScalaTest { 10 | public static void main( String[] args ){ 11 | String inputFile = "data/dummy.txt"; 12 | SparkConf configuration = new SparkConf().setMaster("local[4]").setAppName("My App"); 13 | JavaSparkContext sparkContext = new JavaSparkContext(configuration); 14 | JavaRDD logData = sparkContext.textFile(inputFile).cache(); 15 | 16 | long numberA = logData.filter(new Function(){ 17 | private static final long serialVersionUID = 1L; 18 | public Boolean call(String s){ 19 | return s.length() == 0; 20 | } 21 | }).count(); 22 | sparkContext.close(); 23 | System.out.println("Empty Lines: " + 
numberA); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /Chapter08/Chap-08-Code.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Java-Data-Science-Cookbook/b2bf9ef367bf6c04a96e24123e4160b733b7fed9/Chapter08/Chap-08-Code.rar -------------------------------------------------------------------------------- /Chapter08/Chap-08-Code/Code/DBNIrisExample.java: -------------------------------------------------------------------------------- 1 | package deepbelief.chap8.science.data; 2 | 3 | 4 | import org.deeplearning4j.datasets.iterator.DataSetIterator; 5 | import org.deeplearning4j.datasets.iterator.impl.IrisDataSetIterator; 6 | import org.deeplearning4j.eval.Evaluation; 7 | import org.deeplearning4j.nn.api.OptimizationAlgorithm; 8 | import org.deeplearning4j.nn.conf.MultiLayerConfiguration; 9 | import org.deeplearning4j.nn.conf.NeuralNetConfiguration; 10 | import org.deeplearning4j.nn.conf.Updater; 11 | import org.deeplearning4j.nn.conf.layers.OutputLayer; 12 | import org.deeplearning4j.nn.conf.layers.RBM; 13 | import org.deeplearning4j.nn.multilayer.MultiLayerNetwork; 14 | import org.deeplearning4j.nn.params.DefaultParamInitializer; 15 | import org.deeplearning4j.nn.weights.WeightInit; 16 | import org.deeplearning4j.optimize.api.IterationListener; 17 | import org.deeplearning4j.optimize.listeners.ScoreIterationListener; 18 | import org.nd4j.linalg.api.ndarray.INDArray; 19 | import org.nd4j.linalg.dataset.DataSet; 20 | import org.nd4j.linalg.dataset.SplitTestAndTrain; 21 | import org.nd4j.linalg.factory.Nd4j; 22 | import org.nd4j.linalg.lossfunctions.LossFunctions; 23 | import org.slf4j.Logger; 24 | import org.slf4j.LoggerFactory; 25 | 26 | 27 | import java.util.Arrays; 28 | import java.util.Random; 29 | 30 | public class DBNIrisExample { 31 | 32 | private static Logger log = LoggerFactory.getLogger(DBNIrisExample.class); 33 | 34 | 
public static void main(String[] args) throws Exception { 35 | // Customizing params 36 | Nd4j.MAX_SLICES_TO_PRINT = -1; 37 | Nd4j.MAX_ELEMENTS_PER_SLICE = -1; 38 | 39 | final int numRows = 4; 40 | final int numColumns = 1; 41 | int outputNum = 3; 42 | int numSamples = 150; 43 | int batchSize = 150; 44 | int iterations = 5; 45 | int splitTrainNum = (int) (batchSize * .8); 46 | int seed = 123; 47 | int listenerFreq = 1; 48 | 49 | log.info("Load data...."); 50 | DataSetIterator iter = new IrisDataSetIterator(batchSize, numSamples); 51 | DataSet next = iter.next(); 52 | next.normalizeZeroMeanZeroUnitVariance(); 53 | 54 | log.info("Split data...."); 55 | SplitTestAndTrain testAndTrain = next.splitTestAndTrain(splitTrainNum, new Random(seed)); 56 | DataSet train = testAndTrain.getTrain(); 57 | DataSet test = testAndTrain.getTest(); 58 | Nd4j.ENFORCE_NUMERICAL_STABILITY = true; 59 | 60 | log.info("Build model...."); 61 | MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder() 62 | .seed(seed) // Locks in weight initialization for tuning 63 | .iterations(iterations) // # training iterations predict/classify & backprop 64 | .learningRate(1e-6f) // Optimization step size 65 | .optimizationAlgo(OptimizationAlgorithm.CONJUGATE_GRADIENT) // Backprop to calculate gradients 66 | .l1(1e-1).regularization(true).l2(2e-4) 67 | .useDropConnect(true) 68 | .list(2) // # NN layers (doesn't count input layer) 69 | .layer(0, new RBM.Builder(RBM.HiddenUnit.RECTIFIED, RBM.VisibleUnit.GAUSSIAN) 70 | .nIn(numRows * numColumns) // # input nodes 71 | .nOut(3) // # fully connected hidden layer nodes. Add list if multiple layers. 
72 | .weightInit(WeightInit.XAVIER) // Weight initialization 73 | .k(1) // # contrastive divergence iterations 74 | .activation("relu") // Activation function type 75 | .lossFunction(LossFunctions.LossFunction.RMSE_XENT) // Loss function type 76 | .updater(Updater.ADAGRAD) 77 | .dropOut(0.5) 78 | .build() 79 | ) // NN layer type 80 | .layer(1, new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT) 81 | .nIn(3) // # input nodes 82 | .nOut(outputNum) // # output nodes 83 | .activation("softmax") 84 | .build() 85 | ) // NN layer type 86 | .build(); 87 | MultiLayerNetwork model = new MultiLayerNetwork(conf); 88 | model.init(); 89 | // model.setListeners(Arrays.asList(new ScoreIterationListener(listenerFreq), 90 | // new GradientPlotterIterationListener(listenerFreq), 91 | // new LossPlotterIterationListener(listenerFreq))); 92 | 93 | 94 | model.setListeners(Arrays.asList((IterationListener) new ScoreIterationListener(listenerFreq))); 95 | log.info("Train model...."); 96 | model.fit(train); 97 | 98 | log.info("Evaluate weights...."); 99 | for(org.deeplearning4j.nn.api.Layer layer : model.getLayers()) { 100 | INDArray w = layer.getParam(DefaultParamInitializer.WEIGHT_KEY); 101 | log.info("Weights: " + w); 102 | } 103 | 104 | log.info("Evaluate model...."); 105 | Evaluation eval = new Evaluation(outputNum); 106 | INDArray output = model.output(test.getFeatureMatrix()); 107 | 108 | for (int i = 0; i < output.rows(); i++) { 109 | String actual = test.getLabels().getRow(i).toString().trim(); 110 | String predicted = output.getRow(i).toString().trim(); 111 | log.info("actual " + actual + " vs predicted " + predicted); 112 | } 113 | 114 | eval.eval(test.getLabels(), output); 115 | log.info(eval.stats()); 116 | log.info("****************Example finished********************"); 117 | 118 | 119 | /* OutputStream fos = Files.newOutputStream(Paths.get("coefficients.bin")); 120 | DataOutputStream dos = new DataOutputStream(fos); 121 | Nd4j.write(model.params(), dos); 122 | 
package deepbelief.chap8.science.data;

import org.deeplearning4j.datasets.fetchers.MnistDataFetcher;
import org.deeplearning4j.datasets.iterator.impl.MnistDataSetIterator;
import org.deeplearning4j.nn.api.OptimizationAlgorithm;
import org.deeplearning4j.nn.conf.MultiLayerConfiguration;
import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
import org.deeplearning4j.nn.conf.layers.OutputLayer;
import org.deeplearning4j.nn.conf.layers.RBM;
import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
import org.deeplearning4j.optimize.api.IterationListener;
import org.deeplearning4j.optimize.listeners.ScoreIterationListener;
import org.nd4j.linalg.dataset.DataSet;
import org.nd4j.linalg.dataset.api.iterator.DataSetIterator;
import org.nd4j.linalg.lossfunctions.LossFunctions;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Arrays;

/**
 * Trains a deep autoencoder on the MNIST data set: a stack of RBM layers
 * that compresses each 28x28 image (784 inputs) down to a 30-unit code
 * (1000 -> 500 -> 250 -> 100 -> 30) and then mirrors the stack back up to
 * reconstruct the original pixels. Training targets are the inputs
 * themselves (unsupervised reconstruction).
 *
 * @author Adam Gibson
 */
public class DeepAutoEncoderExample {

    private static final Logger log = LoggerFactory.getLogger(DeepAutoEncoderExample.class);

    public static void main(String[] args) throws Exception {
        final int numRows = 28;     // MNIST image height
        final int numColumns = 28;  // MNIST image width
        int seed = 123;             // RNG seed for reproducible weight initialization
        int numSamples = MnistDataFetcher.NUM_EXAMPLES;
        int batchSize = 1000;
        int iterations = 1;
        // BUG FIX: the original computed iterations/5, which is 0 whenever
        // iterations < 5; ScoreIterationListener(0) then divides by zero (or
        // never reports, depending on the DL4J version). Clamp to at least 1.
        int listenerFreq = Math.max(1, iterations / 5);

        log.info("Load data....");
        // 'true' binarizes the pixel values.
        DataSetIterator iter = new MnistDataSetIterator(batchSize, numSamples, true);

        log.info("Build model....");
        MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder()
                .seed(seed)
                .iterations(iterations)
                .optimizationAlgo(OptimizationAlgorithm.LINE_GRADIENT_DESCENT)
                .list(10)
                .layer(0, new RBM.Builder().nIn(numRows * numColumns).nOut(1000).lossFunction(LossFunctions.LossFunction.RMSE_XENT).build())
                .layer(1, new RBM.Builder().nIn(1000).nOut(500).lossFunction(LossFunctions.LossFunction.RMSE_XENT).build())
                .layer(2, new RBM.Builder().nIn(500).nOut(250).lossFunction(LossFunctions.LossFunction.RMSE_XENT).build())
                .layer(3, new RBM.Builder().nIn(250).nOut(100).lossFunction(LossFunctions.LossFunction.RMSE_XENT).build())
                .layer(4, new RBM.Builder().nIn(100).nOut(30).lossFunction(LossFunctions.LossFunction.RMSE_XENT).build()) // encoding stops
                .layer(5, new RBM.Builder().nIn(30).nOut(100).lossFunction(LossFunctions.LossFunction.RMSE_XENT).build()) // decoding starts
                .layer(6, new RBM.Builder().nIn(100).nOut(250).lossFunction(LossFunctions.LossFunction.RMSE_XENT).build())
                .layer(7, new RBM.Builder().nIn(250).nOut(500).lossFunction(LossFunctions.LossFunction.RMSE_XENT).build())
                .layer(8, new RBM.Builder().nIn(500).nOut(1000).lossFunction(LossFunctions.LossFunction.RMSE_XENT).build())
                .layer(9, new OutputLayer.Builder(LossFunctions.LossFunction.RMSE_XENT).nIn(1000).nOut(numRows * numColumns).build())
                .pretrain(true).backprop(true)
                .build();

        MultiLayerNetwork model = new MultiLayerNetwork(conf);
        model.init();

        model.setListeners(Arrays.asList((IterationListener) new ScoreIterationListener(listenerFreq)));

        log.info("Train model....");
        while (iter.hasNext()) {
            DataSet next = iter.next();
            // Autoencoder training: the features serve as both input and target.
            model.fit(new DataSet(next.getFeatureMatrix(), next.getFeatureMatrix()));
        }
    }
}
package word2vec.chap8.science.data;

import org.deeplearning4j.models.embeddings.WeightLookupTable;
import org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable;
import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer;
import org.deeplearning4j.models.word2vec.Word2Vec;
import org.deeplearning4j.models.word2vec.wordstore.inmemory.InMemoryLookupCache;
import org.deeplearning4j.text.sentenceiterator.SentenceIterator;
import org.deeplearning4j.text.sentenceiterator.UimaSentenceIterator;
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.Collection;

/**
 * Trains a Word2Vec model over a raw-text corpus (one sentence per line),
 * writes the learned word vectors to {@code word2vec.txt}, and demonstrates
 * two queries: nearest neighbours of "man" and the cosine similarity of
 * "cruise" and "voyage".
 */
public class Word2VecRawTextExample {

    private static final Logger log = LoggerFactory.getLogger(Word2VecRawTextExample.class);

    public static void main(String[] args) throws Exception {

        // Path to the raw-sentence corpus. NOTE(review): hard-coded Windows
        // path — adjust for your environment before running.
        String filePath = "c:/raw_sentences.txt";

        log.info("Load & Vectorize Sentences....");
        // Iterates the file sentence by sentence, stripping surrounding whitespace.
        SentenceIterator iter = UimaSentenceIterator.createWithPath(filePath);
        // Split each sentence on whitespace to obtain tokens.
        TokenizerFactory t = new DefaultTokenizerFactory();
        t.setTokenPreProcessor(new CommonPreprocessor());

        InMemoryLookupCache cache = new InMemoryLookupCache();
        WeightLookupTable table = new InMemoryLookupTable.Builder()
                .vectorLength(100)      // must match layerSize below
                .useAdaGrad(false)
                .cache(cache)
                .lr(0.025f).build();

        log.info("Building model....");
        // FIX: use parameterized types (ArrayList<String>, Collection<String>)
        // instead of raw types; behavior is unchanged.
        Word2Vec vec = new Word2Vec.Builder()
                .minWordFrequency(5).iterations(1)
                .layerSize(100).lookupTable(table)
                .stopWords(new ArrayList<String>())  // no stop words filtered
                .vocabCache(cache).seed(42)
                .windowSize(5).iterate(iter).tokenizerFactory(t).build();

        log.info("Fitting Word2Vec model....");
        vec.fit();

        log.info("Writing word vectors to text file....");
        WordVectorSerializer.writeWordVectors(vec, "word2vec.txt");

        log.info("Closest Words:");
        Collection<String> lst = vec.wordsNearest("man", 5);
        System.out.println(lst);
        double cosSim = vec.similarity("cruise", "voyage");
        System.out.println(cosSim);
    }
}
package chap9.java.science.data;

/*
 * GRAL: GRAphing Library for Java(R)
 *
 * (C) Copyright 2009-2013 Erich Seifert <dev@erichseifert.de>,
 * Michael Seifert <mseifert@error-reports.org>
 *
 * This file is part of GRAL.
 *
 * GRAL is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * GRAL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with GRAL. If not, see <http://www.gnu.org/licenses/>.
 */

import java.awt.Color;
import java.util.Random;

import de.erichseifert.gral.data.DataSeries;
import de.erichseifert.gral.data.DataSource;
import de.erichseifert.gral.data.DataTable;
import de.erichseifert.gral.examples.ExamplePanel;
import de.erichseifert.gral.plots.XYPlot;
import de.erichseifert.gral.plots.areas.AreaRenderer;
import de.erichseifert.gral.plots.areas.DefaultAreaRenderer2D;
import de.erichseifert.gral.plots.areas.LineAreaRenderer2D;
import de.erichseifert.gral.plots.lines.DefaultLineRenderer2D;
import de.erichseifert.gral.plots.lines.LineRenderer;
import de.erichseifert.gral.plots.points.DefaultPointRenderer2D;
import de.erichseifert.gral.plots.points.PointRenderer;
import de.erichseifert.gral.ui.InteractivePanel;
import de.erichseifert.gral.util.GraphicsUtils;
import de.erichseifert.gral.util.Insets2D;

/**
 * Example panel showing an area plot of three Gaussian random series:
 * the first two are drawn as filled areas, the third as a line area.
 */
public class AreaPlot extends ExamplePanel {
	/** Version id for serialization. */
	private static final long serialVersionUID = 3287044991898775949L;

	/** Source of random values for the example data. */
	private static final Random random = new Random();

	public AreaPlot() {
		// One x column (0..49) plus three independent Gaussian y columns.
		DataTable table = new DataTable(Double.class, Double.class, Double.class, Double.class);
		for (int i = 0; i < 50; i++) {
			table.add((double) i,
					random.nextGaussian(),
					random.nextGaussian(),
					random.nextGaussian());
		}

		// One series per y column, all sharing column 0 as x.
		DataSeries series1 = new DataSeries("series 1", table, 0, 1);
		DataSeries series2 = new DataSeries("series 2", table, 0, 2);
		DataSeries series3 = new DataSeries("series 3", table, 0, 3);

		// Combine the three series into a single xy-plot.
		XYPlot plot = new XYPlot(series1, series2, series3);
		plot.setLegendVisible(true);
		plot.setInsets(new Insets2D.Double(20.0, 40.0, 20.0, 20.0));

		// Style each series.
		formatFilledArea(plot, series1, COLOR2);
		formatFilledArea(plot, series2, COLOR1);
		formatLineArea(plot, series3, GraphicsUtils.deriveDarker(COLOR1));

		// Embed the plot in this Swing panel.
		add(new InteractivePanel(plot));
	}

	/** Styles a series as points, a gapped line, and a translucent filled area. */
	private static void formatFilledArea(XYPlot plot, DataSource series, Color color) {
		PointRenderer points = new DefaultPointRenderer2D();
		points.setColor(color);
		plot.setPointRenderer(series, points);

		LineRenderer line = new DefaultLineRenderer2D();
		line.setColor(color);
		line.setGap(3.0);
		line.setGapRounded(true);
		plot.setLineRenderer(series, line);

		AreaRenderer area = new DefaultAreaRenderer2D();
		area.setColor(GraphicsUtils.deriveWithAlpha(color, 64));
		plot.setAreaRenderer(series, area);
	}

	/** Styles a series as points over vertical value lines, with no connecting line. */
	private static void formatLineArea(XYPlot plot, DataSource series, Color color) {
		PointRenderer points = new DefaultPointRenderer2D();
		points.setColor(color);
		plot.setPointRenderer(series, points);
		plot.setLineRenderer(series, null);

		AreaRenderer area = new LineAreaRenderer2D();
		area.setGap(3.0);
		area.setColor(color);
		plot.setAreaRenderer(series, area);
	}

	@Override
	public String getTitle() {
		return "Area plot";
	}

	@Override
	public String getDescription() {
		return "Area plot of three series with different styling";
	}

	public static void main(String[] args) {
		new AreaPlot().showInFrame();
	}
}
package chap9.java.science.data;

/*
 * GRAL: GRAphing Library for Java(R)
 *
 * (C) Copyright 2009-2013 Erich Seifert <dev@erichseifert.de>,
 * Michael Seifert <mseifert@error-reports.org>
 *
 * This file is part of GRAL.
 *
 * GRAL is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * GRAL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with GRAL. If not, see <http://www.gnu.org/licenses/>.
 */

import java.util.Random;

import de.erichseifert.gral.data.DataSource;
import de.erichseifert.gral.data.DataTable;
import de.erichseifert.gral.data.EnumeratedData;
import de.erichseifert.gral.data.statistics.Histogram1D;
import de.erichseifert.gral.data.statistics.Statistics;
import de.erichseifert.gral.examples.ExamplePanel;
import de.erichseifert.gral.plots.BarPlot;
import de.erichseifert.gral.ui.InteractivePanel;
import de.erichseifert.gral.util.GraphicsUtils;
import de.erichseifert.gral.util.Insets2D;
import de.erichseifert.gral.util.MathUtils;
import de.erichseifert.gral.util.Orientation;


/**
 * Example panel that bins standard-normal random samples into a fixed set of
 * intervals and renders the resulting histogram as a bar plot.
 */
public class HistogramPlot extends ExamplePanel {
	/** Version id for serialization. */
	private static final long serialVersionUID = 4458280577519421950L;

	/** Number of random samples to draw. */
	private static final int SAMPLE_COUNT = 1000;

	public HistogramPlot() {
		// Draw SAMPLE_COUNT standard-normal samples.
		Random rand = new Random();
		DataTable samples = new DataTable(Double.class);
		for (int n = 0; n < SAMPLE_COUNT; n++) {
			samples.add(rand.nextGaussian());
		}

		// Bin the samples using fixed borders from -4 to +4 in steps of 0.8.
		Number[] binBorders = {-4.0, -3.2, -2.4, -1.6, -0.8, 0.0, 0.8, 1.6, 2.4, 3.2, 4.0};
		Histogram1D histogram = new Histogram1D(samples, Orientation.VERTICAL, binBorders);
		// Add an x dimension: start at the center of the first bin, step by the bin width.
		DataSource histogram2d = new EnumeratedData(histogram, (-4.0 + -3.2) / 2.0, 0.8);

		// Render the binned counts as bars.
		BarPlot plot = new BarPlot(histogram2d);

		// Overall plot appearance.
		plot.setInsets(new Insets2D.Double(20.0, 65.0, 50.0, 40.0));
		plot.getTitle().setText(
			String.format("Distribution of %d random samples", samples.getRowCount()));
		plot.setBarWidth(0.78);

		// x axis: ticks aligned to bin borders, no minor ticks.
		plot.getAxisRenderer(BarPlot.AXIS_X).setTickAlignment(0.0);
		plot.getAxisRenderer(BarPlot.AXIS_X).setTickSpacing(0.8);
		plot.getAxisRenderer(BarPlot.AXIS_X).setMinorTicksVisible(false);
		// y axis: from zero up to just above the tallest bin, rounded to 25s.
		plot.getAxis(BarPlot.AXIS_Y).setRange(0.0,
			MathUtils.ceil(histogram.getStatistics().get(Statistics.MAX) * 1.1, 25.0));
		plot.getAxisRenderer(BarPlot.AXIS_Y).setTickAlignment(0.0);
		plot.getAxisRenderer(BarPlot.AXIS_Y).setMinorTicksVisible(false);
		plot.getAxisRenderer(BarPlot.AXIS_Y).setIntersection(-4.4);

		// Bars: semi-transparent fill with visible count labels.
		plot.getPointRenderer(histogram2d).setColor(
			GraphicsUtils.deriveWithAlpha(COLOR1, 128));
		plot.getPointRenderer(histogram2d).setValueVisible(true);

		// Static display: disable panning and zooming.
		InteractivePanel panel = new InteractivePanel(plot);
		panel.setPannable(false);
		panel.setZoomable(false);
		add(panel);
	}

	@Override
	public String getTitle() {
		return "Histogram plot";
	}

	@Override
	public String getDescription() {
		return String.format("Histogram of %d samples", SAMPLE_COUNT);
	}

	public static void main(String[] args) {
		new HistogramPlot().showInFrame();
	}
}
package chap9.java.science.data;

/*
 * GRAL: GRAphing Library for Java(R)
 *
 * (C) Copyright 2009-2013 Erich Seifert <dev@erichseifert.de>,
 * Michael Seifert <mseifert@error-reports.org>
 *
 * This file is part of GRAL.
 *
 * GRAL is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * GRAL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with GRAL. If not, see <http://www.gnu.org/licenses/>.
 */

import java.awt.BorderLayout;
import java.util.Random;

import de.erichseifert.gral.data.DataTable;
import de.erichseifert.gral.examples.ExamplePanel;
import de.erichseifert.gral.plots.XYPlot;
import de.erichseifert.gral.ui.InteractivePanel;
import de.erichseifert.gral.util.Insets2D;


/**
 * Example panel showing a scatter plot of {@code SAMPLE_COUNT} points drawn
 * from a two-dimensional Gaussian distribution (standard deviation 2 per axis).
 */
public class ScatterPlot extends ExamplePanel {
	/** Version id for serialization. */
	private static final long serialVersionUID = -412699430625953887L;

	/** Number of data points to generate. */
	private static final int SAMPLE_COUNT = 100000;

	/** Instance to generate random data values. */
	private static final Random random = new Random();

	@SuppressWarnings("unchecked")
	public ScatterPlot() {
		// Generate exactly SAMPLE_COUNT data points.
		// BUG FIX: the original loop condition was i <= SAMPLE_COUNT, which
		// produced 100,001 points — one more than both the comment and the
		// plot title (via getDescription()) claim.
		DataTable data = new DataTable(Double.class, Double.class);
		for (int i = 0; i < SAMPLE_COUNT; i++) {
			data.add(random.nextGaussian() * 2.0, random.nextGaussian() * 2.0);
		}

		// Create a new xy-plot over the generated data.
		XYPlot plot = new XYPlot(data);

		// Format plot: margins and title.
		plot.setInsets(new Insets2D.Double(20.0, 40.0, 40.0, 40.0));
		plot.getTitle().setText(getDescription());

		// Format points.
		plot.getPointRenderer(data).setColor(COLOR1);

		// Add plot to Swing component.
		add(new InteractivePanel(plot), BorderLayout.CENTER);
	}

	@Override
	public String getTitle() {
		return "Scatter plot";
	}

	@Override
	public String getDescription() {
		return String.format("Scatter plot with %d data points", SAMPLE_COUNT);
	}

	public static void main(String[] args) {
		new ScatterPlot().showInFrame();
	}

}
package chap9.java.science.data;

/*
 * GRAL: GRAphing Library for Java(R)
 *
 * (C) Copyright 2009-2013 Erich Seifert <dev@erichseifert.de>,
 * Michael Seifert <mseifert@error-reports.org>
 *
 * This file is part of GRAL.
 *
 * GRAL is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * GRAL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with GRAL. If not, see <http://www.gnu.org/licenses/>.
 */

import java.awt.BasicStroke;
import java.awt.Color;
import java.awt.Font;
import java.awt.LinearGradientPaint;

import de.erichseifert.gral.data.DataTable;
import de.erichseifert.gral.examples.ExamplePanel;
import de.erichseifert.gral.plots.BarPlot;
import de.erichseifert.gral.plots.BarPlot.BarRenderer;
import de.erichseifert.gral.ui.InteractivePanel;
import de.erichseifert.gral.util.GraphicsUtils;
import de.erichseifert.gral.util.Insets2D;
import de.erichseifert.gral.util.Location;


/**
 * Example panel showing a bar plot of monthly example values with a vertical
 * color gradient and month names rendered inside the bars.
 */
public class SimpleBarPlot extends ExamplePanel {
	/** Version id for serialization. */
	private static final long serialVersionUID = -2793954497895054530L;

	@SuppressWarnings("unchecked")
	public SimpleBarPlot() {
		// Example data: one row per month — x position, bar value, label.
		double[] positions = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8};
		int[] values = {1, 3, -2, 6, -4, 8, 9, 11};
		String[] months = {"January", "February", "March", "April",
				"May", "June", "July", "August"};
		DataTable data = new DataTable(Double.class, Integer.class, String.class);
		for (int row = 0; row < months.length; row++) {
			data.add(positions[row], values[row], months[row]);
		}

		// Create a new bar plot over the example data.
		BarPlot plot = new BarPlot(data);

		// Format plot: margins and bar width.
		plot.setInsets(new Insets2D.Double(40.0, 40.0, 40.0, 40.0));
		plot.setBarWidth(0.075);

		// Format bars: vertical gradient fill and month labels (column 2)
		// centered inside each bar.
		BarRenderer bars = (BarRenderer) plot.getPointRenderer(data);
		bars.setColor(
			new LinearGradientPaint(0f, 0f, 0f, 1f,
				new float[] {0.0f, 1.0f},
				new Color[] {COLOR1, GraphicsUtils.deriveBrighter(COLOR1)}
			)
		);
		bars.setValueVisible(true);
		bars.setValueColumn(2);
		bars.setValueLocation(Location.CENTER);
		bars.setValueColor(GraphicsUtils.deriveDarker(COLOR1));
		bars.setValueFont(Font.decode(null).deriveFont(Font.BOLD));

		// Add plot to Swing component.
		add(new InteractivePanel(plot));
	}

	@Override
	public String getTitle() {
		return "Bar plot";
	}

	@Override
	public String getDescription() {
		return "Bar plot with example data and color gradients";
	}

	public static void main(String[] args) {
		new SimpleBarPlot().showInFrame();
	}
}
package chap9.java.science.data;

/*
 * GRAL: GRAphing Library for Java(R)
 *
 * (C) Copyright 2009-2013 Erich Seifert <dev@erichseifert.de>,
 * Michael Seifert <mseifert@error-reports.org>
 *
 * This file is part of GRAL.
 *
 * GRAL is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * GRAL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with GRAL. If not, see <http://www.gnu.org/licenses/>.
 */

import java.awt.BasicStroke;
import java.awt.Color;
import java.awt.Dimension;
import java.awt.Stroke;
import java.util.Random;

import de.erichseifert.gral.data.DataSource;
import de.erichseifert.gral.data.DataTable;
import de.erichseifert.gral.examples.ExamplePanel;
import de.erichseifert.gral.plots.BoxPlot;
import de.erichseifert.gral.plots.BoxPlot.BoxWhiskerRenderer;
import de.erichseifert.gral.plots.XYPlot.XYNavigationDirection;
import de.erichseifert.gral.plots.colors.LinearGradient;
import de.erichseifert.gral.plots.colors.ScaledContinuousColorMapper;
import de.erichseifert.gral.ui.InteractivePanel;
import de.erichseifert.gral.util.DataUtils;
import de.erichseifert.gral.util.GraphicsUtils;
import de.erichseifert.gral.util.Insets2D;


/**
 * Example panel showing three box-and-whisker plots, one per column of a
 * table of rounded Gaussian integers.
 */
public class SimpleBoxPlot extends ExamplePanel {
	/** Version id for serialization. */
	private static final long serialVersionUID = 5228891435595348789L;

	/** Number of rows of random samples to generate. */
	private static final int SAMPLE_COUNT = 50;

	/** Source of random values for the example data. */
	private static final Random random = new Random();

	@SuppressWarnings("unchecked")
	public SimpleBoxPlot() {
		setPreferredSize(new Dimension(400, 600));

		// Three columns of integers drawn from round(5 * N(0, 1)).
		DataTable samples = new DataTable(Integer.class, Integer.class, Integer.class);
		for (int n = 0; n < SAMPLE_COUNT; n++) {
			int a = (int) Math.round(5.0 * random.nextGaussian());
			int b = (int) Math.round(5.0 * random.nextGaussian());
			int c = (int) Math.round(5.0 * random.nextGaussian());
			samples.add(a, b, c);
		}

		// Derive box-and-whisker statistics per column and plot them.
		DataSource boxData = BoxPlot.createBoxData(samples);
		BoxPlot plot = new BoxPlot(boxData);

		// Format plot margins.
		plot.setInsets(new Insets2D.Double(20.0, 50.0, 40.0, 20.0));

		// Label the x axis with one custom tick per source column.
		plot.getAxisRenderer(BoxPlot.AXIS_X).setCustomTicks(
			DataUtils.map(
				new Double[] {1.0, 2.0, 3.0},
				new String[] {"Column 1", "Column 2", "Column 3"}
			)
		);

		// Render boxes, whiskers and center bars in a single color.
		BoxWhiskerRenderer boxes =
			(BoxWhiskerRenderer) plot.getPointRenderer(boxData);
		boxes.setBoxBorderColor(COLOR1);
		boxes.setWhiskerColor(COLOR1);
		boxes.setCenterBarColor(COLOR1);

		// Restrict navigation to the vertical (value) axis.
		plot.getNavigator().setDirection(XYNavigationDirection.VERTICAL);

		// Add plot to Swing component.
		InteractivePanel panel = new InteractivePanel(plot);
		add(panel);
	}

	@Override
	public String getTitle() {
		return "Box-and-whisker plot";
	}

	@Override
	public String getDescription() {
		return String.format("Three box-and-whisker plots created from %d random samples", SAMPLE_COUNT);
	}

	public static void main(String[] args) {
		new SimpleBoxPlot().showInFrame();
	}
}
package chap9.java.science.data;

/*
 * GRAL: GRAphing Library for Java(R)
 *
 * (C) Copyright 2009-2013 Erich Seifert <dev@erichseifert.de>,
 * Michael Seifert <mseifert@error-reports.org>
 *
 * This file is part of GRAL.
 *
 * GRAL is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * GRAL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with GRAL. If not, see <http://www.gnu.org/licenses/>.
 */

import java.awt.BorderLayout;
import java.awt.Color;
import java.awt.Font;
import java.util.Random;

import de.erichseifert.gral.data.DataTable;
import de.erichseifert.gral.examples.ExamplePanel;
import de.erichseifert.gral.plots.PiePlot;
import de.erichseifert.gral.plots.PiePlot.PieSliceRenderer;
import de.erichseifert.gral.plots.colors.LinearGradient;
import de.erichseifert.gral.ui.InteractivePanel;
import de.erichseifert.gral.util.Insets2D;


/**
 * Example panel showing a donut (pie) plot of random integer values; roughly
 * 15% of the slices are negated so the plot also demonstrates negative
 * segments.
 */
public class SimplePiePlot extends ExamplePanel {
	/** Version id for serialization. */
	private static final long serialVersionUID = -3039317265508932299L;

	/** Number of random slice values to generate. */
	private static final int SAMPLE_COUNT = 10;

	/** Instance to generate random data values. */
	private static Random random = new Random();

	@SuppressWarnings("unchecked")
	public SimplePiePlot() {
		// One random magnitude in [2, 9] per slice; ~15% become negative.
		DataTable data = new DataTable(Integer.class);
		for (int i = 0; i < SAMPLE_COUNT; i++) {
			int magnitude = random.nextInt(8) + 2;
			if (random.nextDouble() <= 0.15) {
				data.add(-magnitude);
			} else {
				data.add(magnitude);
			}
		}

		// Create a new pie plot over the generated values.
		PiePlot plot = new PiePlot(data);

		// Format plot: title, pie size, legend, margins.
		plot.getTitle().setText(getDescription());
		plot.setRadius(0.9);
		plot.setLegendVisible(true);
		plot.setInsets(new Insets2D.Double(20.0, 40.0, 40.0, 40.0));

		// Format slices: inner hole (donut), gaps, gradient colors, labels.
		PieSliceRenderer slices =
			(PieSliceRenderer) plot.getPointRenderer(data);
		slices.setInnerRadius(0.4);
		slices.setGap(0.2);
		LinearGradient colors = new LinearGradient(COLOR1, COLOR2);
		slices.setColor(colors);
		slices.setValueVisible(true);
		slices.setValueColor(Color.WHITE);
		slices.setValueFont(Font.decode(null).deriveFont(Font.BOLD));

		// Add plot to Swing component.
		add(new InteractivePanel(plot), BorderLayout.CENTER);
	}

	@Override
	public String getTitle() {
		return "Donut plot";
	}

	@Override
	public String getDescription() {
		return String.format("Donut plot of %d random data values", SAMPLE_COUNT);
	}

	public static void main(String[] args) {
		new SimplePiePlot().showInFrame();
	}
}
/*
 * GRAL: GRAphing Library for Java(R)
 *
 * (C) Copyright 2009-2013 Erich Seifert <dev@erichseifert.de>,
 * Michael Seifert <mseifert@error-reports.org>
 *
 * This file is part of GRAL.
 *
 * GRAL is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * GRAL is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with GRAL. If not, see <http://www.gnu.org/licenses/>.
 */
package chap9.java.science.data;

import java.awt.Color;
import java.io.FileNotFoundException;
import java.io.IOException;

import javax.swing.JFrame;

import de.erichseifert.gral.data.DataTable;
import de.erichseifert.gral.plots.XYPlot;
import de.erichseifert.gral.plots.lines.DefaultLineRenderer2D;
import de.erichseifert.gral.plots.lines.LineRenderer;
import de.erichseifert.gral.ui.InteractivePanel;

/**
 * Swing frame that plots y = 5*sin(x) for x in [-5, 5] (step 0.25) as a black
 * line with black points, using GRAL's interactive XY plot panel.
 */
public class SineGraph extends JFrame {
	private static final long serialVersionUID = 1L;

	/**
	 * Builds the frame and its sine plot.
	 *
	 * @throws FileNotFoundException declared for API compatibility; not thrown here
	 * @throws IOException declared for API compatibility; not thrown here
	 */
	public SineGraph() throws FileNotFoundException, IOException {
		setDefaultCloseOperation(EXIT_ON_CLOSE);
		setSize(1600, 1400);

		// Sample the sine curve: x in [-5, 5] with step 0.25.
		DataTable data = new DataTable(Double.class, Double.class);
		for (double x = -5.0; x <= 5.0; x += 0.25) {
			double y = 5.0 * Math.sin(x);
			data.add(x, y);
		}

		XYPlot plot = new XYPlot(data);
		getContentPane().add(new InteractivePanel(plot));

		// Connect the points with a line and draw both in black.
		LineRenderer lines = new DefaultLineRenderer2D();
		plot.setLineRenderer(data, lines);
		Color color = new Color(0.0f, 0.0f, 0.0f);
		plot.getPointRenderer(data).setColor(color);
		plot.getLineRenderer(data).setColor(color);
	}

	public static void main(String[] args) {
		// BUG FIX: the original swallowed the IOException in an empty catch and
		// then unconditionally called frame.setVisible(true), so any failure in
		// the constructor turned into a NullPointerException. Keep setVisible
		// inside the try and report the failure instead.
		try {
			SineGraph frame = new SineGraph();
			frame.setVisible(true);
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
}
try { 61 | frame = new SineGraph(); 62 | } catch (IOException e) { 63 | } 64 | frame.setVisible(true); 65 | } 66 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Java Data Science Cookbook 5 | This is the code repository for [Java Data Science Cookbook](https://www.packtpub.com/big-data-and-business-intelligence/java-data-science-cookbook?utm_source=github&utm_medium=repository&utm_campaign=9781787122536), published by [Packt](https://www.packtpub.com/?utm_source=github). 
It contains all the supporting project files necessary to work through the book from start to finish. 6 | ## About the Book 7 | If you are looking to build data science models that are good for production, Java has come to the rescue. With the aid of strong libraries such as MLlib, Weka, DL4j, and more, you can efficiently perform all the data science tasks you need to. 8 | ## Instructions and Navigation 9 | All of the code is organized into folders. Each folder starts with a number followed by the application name. For example, Chapter02. 10 | 11 | 12 | 13 | The code will look like the following: 14 | ``` 15 | classVals = new ArrayList(); 16 | for (int i = 0; i < 5; i++){ 17 | classVals.add("class" + (i + 1)); 18 | } 19 | ``` 20 | 21 | We have used Java to solve real-world data science problems. Our focus was to deliver content that can be effective for anyone who wants to know how to solve problems with Java. A minimum knowledge of Java is required, such as classes, objects, methods, arguments and parameters, exceptions, and exporting Java Archive (JAR) files. The code is well supported with narrations, information, and tips to help the readers understand the 22 | context and purpose. The theories behind the problems solved in this book, on many occasions, are not thoroughly discussed, but references for interested readers are provided whenever necessary. 
23 | 24 | ## Related Products 25 | * [Practical Data Science Cookbook - Second Edition](https://www.packtpub.com/big-data-and-business-intelligence/practical-data-science-cookbook-second-edition?utm_source=github&utm_medium=repository&utm_campaign=9781787129627) 26 | 27 | * [Apache Spark for Data Science Cookbook](https://www.packtpub.com/big-data-and-business-intelligence/apache-spark-data-science-cookbook?utm_source=github&utm_medium=repository&utm_campaign=9781785880100) 28 | 29 | * [Mastering Java for Data Science](https://www.packtpub.com/big-data-and-business-intelligence/mastering-java-data-science?utm_source=github&utm_medium=repository&utm_campaign=9781782174271) 30 | 31 | ### Suggestions and Feedback 32 | [Click here](https://docs.google.com/forms/d/e/1FAIpQLSe5qwunkGf6PUvzPirPDtuy1Du5Rlzew23UBp2S-P3wB-GcwQ/viewform) if you have any feedback or suggestions. 33 | ### Download a free PDF 34 | 35 | If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost.
Simply click on the link to claim your free PDF.
36 |

https://packt.link/free-ebook/9781787122536

--------------------------------------------------------------------------------