├── .gitignore
├── Procfile
├── README.md
├── ch02
├── Email Analysis.xlsb
├── README.md
└── probability.pig
├── ch03
├── README.md
├── cat_avro
├── gmail
│ ├── email.avro.schema
│ ├── email_utils.py
│ ├── gmail.py
│ └── gmail_slurper.py
├── pig
│ ├── avro_to_mongo.pig
│ ├── elasticsearch.pig
│ ├── mongo.pig
│ └── sent_counts.pig
├── python
│ ├── elasticsearch.py
│ ├── flask_echo.py
│ ├── flask_mongo.py
│ ├── mongo.py
│ └── test_avro.py
└── web
│ ├── index.py
│ ├── static
│ ├── bootstrap
│ │ ├── css
│ │ │ ├── bootstrap-responsive.css
│ │ │ ├── bootstrap-responsive.min.css
│ │ │ ├── bootstrap.css
│ │ │ └── bootstrap.min.css
│ │ ├── img
│ │ │ ├── glyphicons-halflings-white.png
│ │ │ └── glyphicons-halflings.png
│ │ └── js
│ │ │ ├── bootstrap.js
│ │ │ └── bootstrap.min.js
│ ├── d3
│ │ ├── d3.v3.js
│ │ └── d3.v3.min.js
│ └── nvd3
│ │ ├── .gitignore
│ │ ├── LICENSE.md
│ │ ├── Makefile
│ │ ├── README.md
│ │ ├── build.bat
│ │ ├── build.sh
│ │ ├── deprecated
│ │ ├── bar.html
│ │ ├── bar.js
│ │ ├── charts
│ │ │ ├── cumulativeLineChart.js
│ │ │ ├── discreteBarChart.js
│ │ │ ├── lineChart.js
│ │ │ ├── lineChartDaily.js
│ │ │ └── stackedAreaChart.js
│ │ ├── cumulativeLine.html
│ │ ├── cumulativeLine.js
│ │ ├── discreteBarChartWithEnabledTooltip.html
│ │ ├── discreteBarChartWithEnabledTooltip.js
│ │ ├── discreteBarWithAxes.html
│ │ ├── discreteBarWithAxes.js
│ │ ├── lineChart-old.html
│ │ ├── lineChartDaily.html
│ │ ├── linePlusBar.html
│ │ ├── linePlusBar.js
│ │ ├── lineWithFocus.html
│ │ ├── lineWithFocus.js
│ │ ├── lineWithFourAxes.html
│ │ ├── lineWithFourAxes.js
│ │ ├── lineWithLegend.html
│ │ ├── lineWithLegend.js
│ │ ├── monthendAxis.html
│ │ ├── multiBarHorizontalWithLegend.html
│ │ ├── multiBarHorizontalWithLegend.js
│ │ ├── multiBarWithLegend.html
│ │ ├── multiBarWithLegend.js
│ │ ├── pie.js
│ │ ├── scatterChart.html
│ │ ├── scatterChart.js
│ │ ├── scatterFisheyeChart.js
│ │ ├── scatterWithLegend.html
│ │ ├── scatterWithLegend.js
│ │ ├── stackedArea.js
│ │ ├── stackedAreaChart.html
│ │ ├── stackedAreaChart_old.html
│ │ ├── stackedAreaWithLegend.html
│ │ └── stackedAreaWithLegend.js
│ │ ├── examples
│ │ ├── bullet.html
│ │ ├── bulletChart.html
│ │ ├── crossfilter.html
│ │ ├── crossfilterWithDimentions.html
│ │ ├── crossfilterWithTables.html
│ │ ├── cumulativeLineChart.html
│ │ ├── discreteBarChart.html
│ │ ├── historicalBar.html
│ │ ├── horizon.html
│ │ ├── images
│ │ │ ├── grey-minus.png
│ │ │ └── grey-plus.png
│ │ ├── indentedtree.html
│ │ ├── legend.html
│ │ ├── line.html
│ │ ├── lineChart.html
│ │ ├── lineChartSVGResize.html
│ │ ├── linePlusBarChart.html
│ │ ├── linePlusBarWithFocusChart.html
│ │ ├── lineWithFisheyeChart.html
│ │ ├── lineWithFocusChart.html
│ │ ├── multiBar.html
│ │ ├── multiBarChart.html
│ │ ├── multiBarHorizontalChart.html
│ │ ├── multiChart.html
│ │ ├── nations.json
│ │ ├── pie.html
│ │ ├── pieChart.html
│ │ ├── scatter.html
│ │ ├── scatterChart.html
│ │ ├── scatterPlusLineChart.html
│ │ ├── sparkline.html
│ │ ├── sparklinePlus.html
│ │ ├── stackedArea.html
│ │ ├── stackedAreaChart.html
│ │ └── stream_layers.js
│ │ ├── lib
│ │ ├── cie.js
│ │ ├── crossfilter.js
│ │ ├── crossfilter.min.js
│ │ ├── d3.v2.js
│ │ ├── d3.v2.min.js
│ │ ├── fisheye.js
│ │ ├── hive.js
│ │ ├── horizon.js
│ │ └── sankey.js
│ │ ├── nv.d3.js
│ │ ├── nv.d3.min.js
│ │ └── src
│ │ ├── core.js
│ │ ├── intro.js
│ │ ├── models
│ │ ├── axis.js
│ │ ├── backup
│ │ │ ├── bullet.js
│ │ │ └── bulletChart.js
│ │ ├── bullet.js
│ │ ├── bulletChart.js
│ │ ├── cumulativeLineChart.js
│ │ ├── discreteBar.js
│ │ ├── discreteBarChart.js
│ │ ├── distribution.js
│ │ ├── historicalBar.js
│ │ ├── indentedTree.js
│ │ ├── legend.js
│ │ ├── line.js
│ │ ├── lineChart.js
│ │ ├── linePlusBarChart.js
│ │ ├── linePlusBarWithFocusChart.js
│ │ ├── lineWithFisheye.js
│ │ ├── lineWithFisheyeChart.js
│ │ ├── lineWithFocusChart.js
│ │ ├── multiBar.js
│ │ ├── multiBarChart.js
│ │ ├── multiBarHorizontal.js
│ │ ├── multiBarHorizontalChart.js
│ │ ├── multiBarTimeSeries.js
│ │ ├── multiBarTimeSeriesChart.js
│ │ ├── multiChart.js
│ │ ├── ohlcBar.js
│ │ ├── pie.js
│ │ ├── pieChart.js
│ │ ├── scatter.js
│ │ ├── scatterChart.js
│ │ ├── scatterPlusLineChart.js
│ │ ├── sparkline.js
│ │ ├── sparklinePlus.js
│ │ ├── stackedArea.js
│ │ └── stackedAreaChart.js
│ │ ├── nv.d3.css
│ │ ├── outro.js
│ │ ├── tooltip.js
│ │ └── utils.js
│ └── templates
│ └── table.html
├── ch04
├── .dotcloud
│ └── config
├── README.md
├── __init__.py
├── dotcloud.yml
├── index.py
├── requirements.txt
├── test_dotcloud_mongo.pig
└── wsgi.py
├── ch05
├── README.md
├── avro_to_mongo.pig
├── elasticsearch.pig
├── elasticsearch.py
├── list_emails.mongo.js
├── mongo_list.py
└── web
│ ├── config.py
│ ├── index.py
│ ├── static
│ ├── bootstrap
│ │ ├── css
│ │ │ ├── bootstrap-responsive.css
│ │ │ ├── bootstrap-responsive.min.css
│ │ │ ├── bootstrap.css
│ │ │ └── bootstrap.min.css
│ │ ├── img
│ │ │ ├── glyphicons-halflings-white.png
│ │ │ └── glyphicons-halflings.png
│ │ └── js
│ │ │ ├── bootstrap.js
│ │ │ └── bootstrap.min.js
│ ├── d3
│ │ ├── d3.v3.js
│ │ └── d3.v3.min.js
│ └── nvd3
│ │ ├── .gitignore
│ │ ├── LICENSE.md
│ │ ├── Makefile
│ │ ├── README.md
│ │ ├── build.bat
│ │ ├── build.sh
│ │ ├── deprecated
│ │ ├── bar.html
│ │ ├── bar.js
│ │ ├── charts
│ │ │ ├── cumulativeLineChart.js
│ │ │ ├── discreteBarChart.js
│ │ │ ├── lineChart.js
│ │ │ ├── lineChartDaily.js
│ │ │ └── stackedAreaChart.js
│ │ ├── cumulativeLine.html
│ │ ├── cumulativeLine.js
│ │ ├── discreteBarChartWithEnabledTooltip.html
│ │ ├── discreteBarChartWithEnabledTooltip.js
│ │ ├── discreteBarWithAxes.html
│ │ ├── discreteBarWithAxes.js
│ │ ├── lineChart-old.html
│ │ ├── lineChartDaily.html
│ │ ├── linePlusBar.html
│ │ ├── linePlusBar.js
│ │ ├── lineWithFocus.html
│ │ ├── lineWithFocus.js
│ │ ├── lineWithFourAxes.html
│ │ ├── lineWithFourAxes.js
│ │ ├── lineWithLegend.html
│ │ ├── lineWithLegend.js
│ │ ├── monthendAxis.html
│ │ ├── multiBarHorizontalWithLegend.html
│ │ ├── multiBarHorizontalWithLegend.js
│ │ ├── multiBarWithLegend.html
│ │ ├── multiBarWithLegend.js
│ │ ├── pie.js
│ │ ├── scatterChart.html
│ │ ├── scatterChart.js
│ │ ├── scatterFisheyeChart.js
│ │ ├── scatterWithLegend.html
│ │ ├── scatterWithLegend.js
│ │ ├── stackedArea.js
│ │ ├── stackedAreaChart.html
│ │ ├── stackedAreaChart_old.html
│ │ ├── stackedAreaWithLegend.html
│ │ └── stackedAreaWithLegend.js
│ │ ├── examples
│ │ ├── bullet.html
│ │ ├── bulletChart.html
│ │ ├── crossfilter.html
│ │ ├── crossfilterWithDimentions.html
│ │ ├── crossfilterWithTables.html
│ │ ├── cumulativeLineChart.html
│ │ ├── discreteBarChart.html
│ │ ├── historicalBar.html
│ │ ├── horizon.html
│ │ ├── images
│ │ │ ├── grey-minus.png
│ │ │ └── grey-plus.png
│ │ ├── indentedtree.html
│ │ ├── legend.html
│ │ ├── line.html
│ │ ├── lineChart.html
│ │ ├── lineChartSVGResize.html
│ │ ├── linePlusBarChart.html
│ │ ├── linePlusBarWithFocusChart.html
│ │ ├── lineWithFisheyeChart.html
│ │ ├── lineWithFocusChart.html
│ │ ├── multiBar.html
│ │ ├── multiBarChart.html
│ │ ├── multiBarHorizontalChart.html
│ │ ├── multiChart.html
│ │ ├── nations.json
│ │ ├── pie.html
│ │ ├── pieChart.html
│ │ ├── scatter.html
│ │ ├── scatterChart.html
│ │ ├── scatterPlusLineChart.html
│ │ ├── sparkline.html
│ │ ├── sparklinePlus.html
│ │ ├── stackedArea.html
│ │ ├── stackedAreaChart.html
│ │ └── stream_layers.js
│ │ ├── lib
│ │ ├── cie.js
│ │ ├── crossfilter.js
│ │ ├── crossfilter.min.js
│ │ ├── d3.v2.js
│ │ ├── d3.v2.min.js
│ │ ├── fisheye.js
│ │ ├── hive.js
│ │ ├── horizon.js
│ │ └── sankey.js
│ │ ├── nv.d3.js
│ │ ├── nv.d3.min.js
│ │ └── src
│ │ ├── core.js
│ │ ├── intro.js
│ │ ├── models
│ │ ├── axis.js
│ │ ├── backup
│ │ │ ├── bullet.js
│ │ │ └── bulletChart.js
│ │ ├── bullet.js
│ │ ├── bulletChart.js
│ │ ├── cumulativeLineChart.js
│ │ ├── discreteBar.js
│ │ ├── discreteBarChart.js
│ │ ├── distribution.js
│ │ ├── historicalBar.js
│ │ ├── indentedTree.js
│ │ ├── legend.js
│ │ ├── line.js
│ │ ├── lineChart.js
│ │ ├── linePlusBarChart.js
│ │ ├── linePlusBarWithFocusChart.js
│ │ ├── lineWithFisheye.js
│ │ ├── lineWithFisheyeChart.js
│ │ ├── lineWithFocusChart.js
│ │ ├── multiBar.js
│ │ ├── multiBarChart.js
│ │ ├── multiBarHorizontal.js
│ │ ├── multiBarHorizontalChart.js
│ │ ├── multiBarTimeSeries.js
│ │ ├── multiBarTimeSeriesChart.js
│ │ ├── multiChart.js
│ │ ├── ohlcBar.js
│ │ ├── pie.js
│ │ ├── pieChart.js
│ │ ├── scatter.js
│ │ ├── scatterChart.js
│ │ ├── scatterPlusLineChart.js
│ │ ├── sparkline.js
│ │ ├── sparklinePlus.js
│ │ ├── stackedArea.js
│ │ └── stackedAreaChart.js
│ │ ├── nv.d3.css
│ │ ├── outro.js
│ │ ├── tooltip.js
│ │ └── utils.js
│ └── templates
│ ├── layout.html
│ ├── macros.jnj
│ └── partials
│ ├── email.html
│ └── emails.html
├── ch06
├── README.md
├── emails_per_email_address.pig
├── list_addresses.py
├── mongo.js
├── sent_distributions.pig
└── web
│ ├── config.py
│ ├── index.py
│ ├── static
│ ├── bootstrap
│ │ ├── css
│ │ │ ├── bootstrap-responsive.css
│ │ │ ├── bootstrap-responsive.min.css
│ │ │ ├── bootstrap.css
│ │ │ └── bootstrap.min.css
│ │ ├── img
│ │ │ ├── glyphicons-halflings-white.png
│ │ │ └── glyphicons-halflings.png
│ │ └── js
│ │ │ ├── bootstrap.js
│ │ │ └── bootstrap.min.js
│ ├── d3
│ │ ├── d3.v3.js
│ │ └── d3.v3.min.js
│ └── nvd3
│ │ ├── .gitignore
│ │ ├── LICENSE.md
│ │ ├── Makefile
│ │ ├── README.md
│ │ ├── build.bat
│ │ ├── build.sh
│ │ ├── deprecated
│ │ ├── bar.html
│ │ ├── bar.js
│ │ ├── charts
│ │ │ ├── cumulativeLineChart.js
│ │ │ ├── discreteBarChart.js
│ │ │ ├── lineChart.js
│ │ │ ├── lineChartDaily.js
│ │ │ └── stackedAreaChart.js
│ │ ├── cumulativeLine.html
│ │ ├── cumulativeLine.js
│ │ ├── discreteBarChartWithEnabledTooltip.html
│ │ ├── discreteBarChartWithEnabledTooltip.js
│ │ ├── discreteBarWithAxes.html
│ │ ├── discreteBarWithAxes.js
│ │ ├── lineChart-old.html
│ │ ├── lineChartDaily.html
│ │ ├── linePlusBar.html
│ │ ├── linePlusBar.js
│ │ ├── lineWithFocus.html
│ │ ├── lineWithFocus.js
│ │ ├── lineWithFourAxes.html
│ │ ├── lineWithFourAxes.js
│ │ ├── lineWithLegend.html
│ │ ├── lineWithLegend.js
│ │ ├── monthendAxis.html
│ │ ├── multiBarHorizontalWithLegend.html
│ │ ├── multiBarHorizontalWithLegend.js
│ │ ├── multiBarWithLegend.html
│ │ ├── multiBarWithLegend.js
│ │ ├── pie.js
│ │ ├── scatterChart.html
│ │ ├── scatterChart.js
│ │ ├── scatterFisheyeChart.js
│ │ ├── scatterWithLegend.html
│ │ ├── scatterWithLegend.js
│ │ ├── stackedArea.js
│ │ ├── stackedAreaChart.html
│ │ ├── stackedAreaChart_old.html
│ │ ├── stackedAreaWithLegend.html
│ │ └── stackedAreaWithLegend.js
│ │ ├── examples
│ │ ├── bullet.html
│ │ ├── bulletChart.html
│ │ ├── crossfilter.html
│ │ ├── crossfilterWithDimentions.html
│ │ ├── crossfilterWithTables.html
│ │ ├── cumulativeLineChart.html
│ │ ├── discreteBarChart.html
│ │ ├── historicalBar.html
│ │ ├── horizon.html
│ │ ├── images
│ │ │ ├── grey-minus.png
│ │ │ └── grey-plus.png
│ │ ├── indentedtree.html
│ │ ├── legend.html
│ │ ├── line.html
│ │ ├── lineChart.html
│ │ ├── lineChartSVGResize.html
│ │ ├── linePlusBarChart.html
│ │ ├── linePlusBarWithFocusChart.html
│ │ ├── lineWithFisheyeChart.html
│ │ ├── lineWithFocusChart.html
│ │ ├── multiBar.html
│ │ ├── multiBarChart.html
│ │ ├── multiBarHorizontalChart.html
│ │ ├── multiChart.html
│ │ ├── nations.json
│ │ ├── pie.html
│ │ ├── pieChart.html
│ │ ├── scatter.html
│ │ ├── scatterChart.html
│ │ ├── scatterPlusLineChart.html
│ │ ├── sparkline.html
│ │ ├── sparklinePlus.html
│ │ ├── stackedArea.html
│ │ ├── stackedAreaChart.html
│ │ └── stream_layers.js
│ │ ├── lib
│ │ ├── cie.js
│ │ ├── crossfilter.js
│ │ ├── crossfilter.min.js
│ │ ├── d3.v2.js
│ │ ├── d3.v2.min.js
│ │ ├── fisheye.js
│ │ ├── hive.js
│ │ ├── horizon.js
│ │ └── sankey.js
│ │ ├── nv.d3.js
│ │ ├── nv.d3.min.js
│ │ └── src
│ │ ├── core.js
│ │ ├── intro.js
│ │ ├── models
│ │ ├── axis.js
│ │ ├── backup
│ │ │ ├── bullet.js
│ │ │ └── bulletChart.js
│ │ ├── bullet.js
│ │ ├── bulletChart.js
│ │ ├── cumulativeLineChart.js
│ │ ├── discreteBar.js
│ │ ├── discreteBarChart.js
│ │ ├── distribution.js
│ │ ├── historicalBar.js
│ │ ├── indentedTree.js
│ │ ├── legend.js
│ │ ├── line.js
│ │ ├── lineChart.js
│ │ ├── linePlusBarChart.js
│ │ ├── linePlusBarWithFocusChart.js
│ │ ├── lineWithFisheye.js
│ │ ├── lineWithFisheyeChart.js
│ │ ├── lineWithFocusChart.js
│ │ ├── multiBar.js
│ │ ├── multiBarChart.js
│ │ ├── multiBarHorizontal.js
│ │ ├── multiBarHorizontalChart.js
│ │ ├── multiBarTimeSeries.js
│ │ ├── multiBarTimeSeriesChart.js
│ │ ├── multiChart.js
│ │ ├── ohlcBar.js
│ │ ├── pie.js
│ │ ├── pieChart.js
│ │ ├── scatter.js
│ │ ├── scatterChart.js
│ │ ├── scatterPlusLineChart.js
│ │ ├── sparkline.js
│ │ ├── sparklinePlus.js
│ │ ├── stackedArea.js
│ │ └── stackedAreaChart.js
│ │ ├── nv.d3.css
│ │ ├── outro.js
│ │ ├── tooltip.js
│ │ └── utils.js
│ └── templates
│ ├── layout.html
│ ├── macros.jnj
│ └── partials
│ ├── address.html
│ ├── email.html
│ ├── emails.html
│ └── sent_distribution.html
├── ch07
├── README.md
├── mongo
│ ├── mongo.js
│ └── sent_distribution_fix_mongo.js
├── pig
│ ├── lda.pig
│ ├── network.pig
│ ├── ntfidf.macro
│ ├── process_mcl.pig
│ ├── process_topics.pig
│ ├── publish_topics_per_email.pig
│ ├── related_email_addresses.pig
│ ├── sent_distributions_fix.pig
│ ├── test_tokenizers.pig
│ ├── topics.pig
│ └── udfs.py
├── python
│ ├── sent_distribution_fix.py
│ └── token_extractor.py
└── web
│ ├── config.py
│ ├── index.py
│ ├── static
│ ├── bootstrap
│ │ ├── css
│ │ │ ├── bootstrap-responsive.css
│ │ │ ├── bootstrap-responsive.min.css
│ │ │ ├── bootstrap.css
│ │ │ └── bootstrap.min.css
│ │ ├── img
│ │ │ ├── glyphicons-halflings-white.png
│ │ │ └── glyphicons-halflings.png
│ │ └── js
│ │ │ ├── bootstrap.js
│ │ │ └── bootstrap.min.js
│ ├── d3
│ │ ├── d3.v3.js
│ │ └── d3.v3.min.js
│ └── nvd3
│ │ ├── .gitignore
│ │ ├── LICENSE.md
│ │ ├── Makefile
│ │ ├── README.md
│ │ ├── build.bat
│ │ ├── build.sh
│ │ ├── deprecated
│ │ ├── bar.html
│ │ ├── bar.js
│ │ ├── charts
│ │ │ ├── cumulativeLineChart.js
│ │ │ ├── discreteBarChart.js
│ │ │ ├── lineChart.js
│ │ │ ├── lineChartDaily.js
│ │ │ └── stackedAreaChart.js
│ │ ├── cumulativeLine.html
│ │ ├── cumulativeLine.js
│ │ ├── discreteBarChartWithEnabledTooltip.html
│ │ ├── discreteBarChartWithEnabledTooltip.js
│ │ ├── discreteBarWithAxes.html
│ │ ├── discreteBarWithAxes.js
│ │ ├── lineChart-old.html
│ │ ├── lineChartDaily.html
│ │ ├── linePlusBar.html
│ │ ├── linePlusBar.js
│ │ ├── lineWithFocus.html
│ │ ├── lineWithFocus.js
│ │ ├── lineWithFourAxes.html
│ │ ├── lineWithFourAxes.js
│ │ ├── lineWithLegend.html
│ │ ├── lineWithLegend.js
│ │ ├── monthendAxis.html
│ │ ├── multiBarHorizontalWithLegend.html
│ │ ├── multiBarHorizontalWithLegend.js
│ │ ├── multiBarWithLegend.html
│ │ ├── multiBarWithLegend.js
│ │ ├── pie.js
│ │ ├── scatterChart.html
│ │ ├── scatterChart.js
│ │ ├── scatterFisheyeChart.js
│ │ ├── scatterWithLegend.html
│ │ ├── scatterWithLegend.js
│ │ ├── stackedArea.js
│ │ ├── stackedAreaChart.html
│ │ ├── stackedAreaChart_old.html
│ │ ├── stackedAreaWithLegend.html
│ │ └── stackedAreaWithLegend.js
│ │ ├── examples
│ │ ├── bullet.html
│ │ ├── bulletChart.html
│ │ ├── crossfilter.html
│ │ ├── crossfilterWithDimentions.html
│ │ ├── crossfilterWithTables.html
│ │ ├── cumulativeLineChart.html
│ │ ├── discreteBarChart.html
│ │ ├── historicalBar.html
│ │ ├── horizon.html
│ │ ├── images
│ │ │ ├── grey-minus.png
│ │ │ └── grey-plus.png
│ │ ├── indentedtree.html
│ │ ├── legend.html
│ │ ├── line.html
│ │ ├── lineChart.html
│ │ ├── lineChartSVGResize.html
│ │ ├── linePlusBarChart.html
│ │ ├── linePlusBarWithFocusChart.html
│ │ ├── lineWithFisheyeChart.html
│ │ ├── lineWithFocusChart.html
│ │ ├── multiBar.html
│ │ ├── multiBarChart.html
│ │ ├── multiBarHorizontalChart.html
│ │ ├── multiChart.html
│ │ ├── nations.json
│ │ ├── pie.html
│ │ ├── pieChart.html
│ │ ├── scatter.html
│ │ ├── scatterChart.html
│ │ ├── scatterPlusLineChart.html
│ │ ├── sparkline.html
│ │ ├── sparklinePlus.html
│ │ ├── stackedArea.html
│ │ ├── stackedAreaChart.html
│ │ └── stream_layers.js
│ │ ├── lib
│ │ ├── cie.js
│ │ ├── crossfilter.js
│ │ ├── crossfilter.min.js
│ │ ├── d3.v2.js
│ │ ├── d3.v2.min.js
│ │ ├── fisheye.js
│ │ ├── hive.js
│ │ ├── horizon.js
│ │ └── sankey.js
│ │ ├── nv.d3.js
│ │ ├── nv.d3.min.js
│ │ └── src
│ │ ├── core.js
│ │ ├── intro.js
│ │ ├── models
│ │ ├── axis.js
│ │ ├── backup
│ │ │ ├── bullet.js
│ │ │ └── bulletChart.js
│ │ ├── bullet.js
│ │ ├── bulletChart.js
│ │ ├── cumulativeLineChart.js
│ │ ├── discreteBar.js
│ │ ├── discreteBarChart.js
│ │ ├── distribution.js
│ │ ├── historicalBar.js
│ │ ├── indentedTree.js
│ │ ├── legend.js
│ │ ├── line.js
│ │ ├── lineChart.js
│ │ ├── linePlusBarChart.js
│ │ ├── linePlusBarWithFocusChart.js
│ │ ├── lineWithFisheye.js
│ │ ├── lineWithFisheyeChart.js
│ │ ├── lineWithFocusChart.js
│ │ ├── multiBar.js
│ │ ├── multiBarChart.js
│ │ ├── multiBarHorizontal.js
│ │ ├── multiBarHorizontalChart.js
│ │ ├── multiBarTimeSeries.js
│ │ ├── multiBarTimeSeriesChart.js
│ │ ├── multiChart.js
│ │ ├── ohlcBar.js
│ │ ├── pie.js
│ │ ├── pieChart.js
│ │ ├── scatter.js
│ │ ├── scatterChart.js
│ │ ├── scatterPlusLineChart.js
│ │ ├── sparkline.js
│ │ ├── sparklinePlus.js
│ │ ├── stackedArea.js
│ │ └── stackedAreaChart.js
│ │ ├── nv.d3.css
│ │ ├── outro.js
│ │ ├── tooltip.js
│ │ └── utils.js
│ └── templates
│ ├── layout.html
│ ├── macros.jnj
│ └── partials
│ ├── address.html
│ ├── email.html
│ ├── emails.html
│ └── sent_distribution.html
├── ch08
├── README.md
├── mongo.js
├── p_reply_given_from_to.pig
└── web
│ ├── config.py
│ ├── index.py
│ ├── smoother.py
│ ├── static
│ ├── bootstrap
│ │ ├── css
│ │ │ ├── bootstrap-responsive.css
│ │ │ ├── bootstrap-responsive.min.css
│ │ │ ├── bootstrap.css
│ │ │ └── bootstrap.min.css
│ │ ├── img
│ │ │ ├── glyphicons-halflings-white.png
│ │ │ └── glyphicons-halflings.png
│ │ └── js
│ │ │ ├── bootstrap.js
│ │ │ └── bootstrap.min.js
│ ├── d3
│ │ ├── d3.v3.js
│ │ └── d3.v3.min.js
│ └── nvd3
│ │ ├── .gitignore
│ │ ├── LICENSE.md
│ │ ├── Makefile
│ │ ├── README.md
│ │ ├── build.bat
│ │ ├── build.sh
│ │ ├── deprecated
│ │ ├── bar.html
│ │ ├── bar.js
│ │ ├── charts
│ │ │ ├── cumulativeLineChart.js
│ │ │ ├── discreteBarChart.js
│ │ │ ├── lineChart.js
│ │ │ ├── lineChartDaily.js
│ │ │ └── stackedAreaChart.js
│ │ ├── cumulativeLine.html
│ │ ├── cumulativeLine.js
│ │ ├── discreteBarChartWithEnabledTooltip.html
│ │ ├── discreteBarChartWithEnabledTooltip.js
│ │ ├── discreteBarWithAxes.html
│ │ ├── discreteBarWithAxes.js
│ │ ├── lineChart-old.html
│ │ ├── lineChartDaily.html
│ │ ├── linePlusBar.html
│ │ ├── linePlusBar.js
│ │ ├── lineWithFocus.html
│ │ ├── lineWithFocus.js
│ │ ├── lineWithFourAxes.html
│ │ ├── lineWithFourAxes.js
│ │ ├── lineWithLegend.html
│ │ ├── lineWithLegend.js
│ │ ├── monthendAxis.html
│ │ ├── multiBarHorizontalWithLegend.html
│ │ ├── multiBarHorizontalWithLegend.js
│ │ ├── multiBarWithLegend.html
│ │ ├── multiBarWithLegend.js
│ │ ├── pie.js
│ │ ├── scatterChart.html
│ │ ├── scatterChart.js
│ │ ├── scatterFisheyeChart.js
│ │ ├── scatterWithLegend.html
│ │ ├── scatterWithLegend.js
│ │ ├── stackedArea.js
│ │ ├── stackedAreaChart.html
│ │ ├── stackedAreaChart_old.html
│ │ ├── stackedAreaWithLegend.html
│ │ └── stackedAreaWithLegend.js
│ │ ├── examples
│ │ ├── bullet.html
│ │ ├── bulletChart.html
│ │ ├── crossfilter.html
│ │ ├── crossfilterWithDimentions.html
│ │ ├── crossfilterWithTables.html
│ │ ├── cumulativeLineChart.html
│ │ ├── discreteBarChart.html
│ │ ├── historicalBar.html
│ │ ├── horizon.html
│ │ ├── images
│ │ │ ├── grey-minus.png
│ │ │ └── grey-plus.png
│ │ ├── indentedtree.html
│ │ ├── legend.html
│ │ ├── line.html
│ │ ├── lineChart.html
│ │ ├── lineChartSVGResize.html
│ │ ├── linePlusBarChart.html
│ │ ├── linePlusBarWithFocusChart.html
│ │ ├── lineWithFisheyeChart.html
│ │ ├── lineWithFocusChart.html
│ │ ├── multiBar.html
│ │ ├── multiBarChart.html
│ │ ├── multiBarHorizontalChart.html
│ │ ├── multiChart.html
│ │ ├── nations.json
│ │ ├── pie.html
│ │ ├── pieChart.html
│ │ ├── scatter.html
│ │ ├── scatterChart.html
│ │ ├── scatterPlusLineChart.html
│ │ ├── sparkline.html
│ │ ├── sparklinePlus.html
│ │ ├── stackedArea.html
│ │ ├── stackedAreaChart.html
│ │ └── stream_layers.js
│ │ ├── lib
│ │ ├── cie.js
│ │ ├── crossfilter.js
│ │ ├── crossfilter.min.js
│ │ ├── d3.v2.js
│ │ ├── d3.v2.min.js
│ │ ├── fisheye.js
│ │ ├── hive.js
│ │ ├── horizon.js
│ │ └── sankey.js
│ │ ├── nv.d3.js
│ │ ├── nv.d3.min.js
│ │ └── src
│ │ ├── core.js
│ │ ├── intro.js
│ │ ├── models
│ │ ├── axis.js
│ │ ├── backup
│ │ │ ├── bullet.js
│ │ │ └── bulletChart.js
│ │ ├── bullet.js
│ │ ├── bulletChart.js
│ │ ├── cumulativeLineChart.js
│ │ ├── discreteBar.js
│ │ ├── discreteBarChart.js
│ │ ├── distribution.js
│ │ ├── historicalBar.js
│ │ ├── indentedTree.js
│ │ ├── legend.js
│ │ ├── line.js
│ │ ├── lineChart.js
│ │ ├── linePlusBarChart.js
│ │ ├── linePlusBarWithFocusChart.js
│ │ ├── lineWithFisheye.js
│ │ ├── lineWithFisheyeChart.js
│ │ ├── lineWithFocusChart.js
│ │ ├── multiBar.js
│ │ ├── multiBarChart.js
│ │ ├── multiBarHorizontal.js
│ │ ├── multiBarHorizontalChart.js
│ │ ├── multiBarTimeSeries.js
│ │ ├── multiBarTimeSeriesChart.js
│ │ ├── multiChart.js
│ │ ├── ohlcBar.js
│ │ ├── pie.js
│ │ ├── pieChart.js
│ │ ├── scatter.js
│ │ ├── scatterChart.js
│ │ ├── scatterPlusLineChart.js
│ │ ├── sparkline.js
│ │ ├── sparklinePlus.js
│ │ ├── stackedArea.js
│ │ └── stackedAreaChart.js
│ │ ├── nv.d3.css
│ │ ├── outro.js
│ │ ├── tooltip.js
│ │ └── utils.js
│ └── templates
│ ├── layout.html
│ ├── macros.jnj
│ └── partials
│ ├── address.html
│ ├── email.html
│ ├── emails.html
│ └── sent_distribution.html
├── ch09
├── README.md
├── mongo.js
├── pig
│ ├── hamming.py
│ ├── p_reply_given_from_to.pig
│ ├── p_reply_given_time_of_day.pig
│ ├── p_reply_given_topics.pig
│ ├── publish_topics.pig
│ ├── smooth_times.pig
│ ├── test_results.pig
│ └── udfs.py
├── tune_weights.py
└── web
│ ├── config.py
│ ├── index.py
│ ├── smoother.py
│ ├── static
│ ├── bootstrap
│ │ ├── css
│ │ │ ├── bootstrap-responsive.css
│ │ │ ├── bootstrap-responsive.min.css
│ │ │ ├── bootstrap.css
│ │ │ └── bootstrap.min.css
│ │ ├── img
│ │ │ ├── glyphicons-halflings-white.png
│ │ │ └── glyphicons-halflings.png
│ │ └── js
│ │ │ ├── bootstrap.js
│ │ │ └── bootstrap.min.js
│ ├── d3
│ │ ├── d3.v3.js
│ │ └── d3.v3.min.js
│ └── nvd3
│ │ ├── .gitignore
│ │ ├── LICENSE.md
│ │ ├── Makefile
│ │ ├── README.md
│ │ ├── build.bat
│ │ ├── build.sh
│ │ ├── deprecated
│ │ ├── bar.html
│ │ ├── bar.js
│ │ ├── charts
│ │ │ ├── cumulativeLineChart.js
│ │ │ ├── discreteBarChart.js
│ │ │ ├── lineChart.js
│ │ │ ├── lineChartDaily.js
│ │ │ └── stackedAreaChart.js
│ │ ├── cumulativeLine.html
│ │ ├── cumulativeLine.js
│ │ ├── discreteBarChartWithEnabledTooltip.html
│ │ ├── discreteBarChartWithEnabledTooltip.js
│ │ ├── discreteBarWithAxes.html
│ │ ├── discreteBarWithAxes.js
│ │ ├── lineChart-old.html
│ │ ├── lineChartDaily.html
│ │ ├── linePlusBar.html
│ │ ├── linePlusBar.js
│ │ ├── lineWithFocus.html
│ │ ├── lineWithFocus.js
│ │ ├── lineWithFourAxes.html
│ │ ├── lineWithFourAxes.js
│ │ ├── lineWithLegend.html
│ │ ├── lineWithLegend.js
│ │ ├── monthendAxis.html
│ │ ├── multiBarHorizontalWithLegend.html
│ │ ├── multiBarHorizontalWithLegend.js
│ │ ├── multiBarWithLegend.html
│ │ ├── multiBarWithLegend.js
│ │ ├── pie.js
│ │ ├── scatterChart.html
│ │ ├── scatterChart.js
│ │ ├── scatterFisheyeChart.js
│ │ ├── scatterWithLegend.html
│ │ ├── scatterWithLegend.js
│ │ ├── stackedArea.js
│ │ ├── stackedAreaChart.html
│ │ ├── stackedAreaChart_old.html
│ │ ├── stackedAreaWithLegend.html
│ │ └── stackedAreaWithLegend.js
│ │ ├── examples
│ │ ├── bullet.html
│ │ ├── bulletChart.html
│ │ ├── crossfilter.html
│ │ ├── crossfilterWithDimentions.html
│ │ ├── crossfilterWithTables.html
│ │ ├── cumulativeLineChart.html
│ │ ├── discreteBarChart.html
│ │ ├── historicalBar.html
│ │ ├── horizon.html
│ │ ├── images
│ │ │ ├── grey-minus.png
│ │ │ └── grey-plus.png
│ │ ├── indentedtree.html
│ │ ├── legend.html
│ │ ├── line.html
│ │ ├── lineChart.html
│ │ ├── lineChartSVGResize.html
│ │ ├── linePlusBarChart.html
│ │ ├── linePlusBarWithFocusChart.html
│ │ ├── lineWithFisheyeChart.html
│ │ ├── lineWithFocusChart.html
│ │ ├── multiBar.html
│ │ ├── multiBarChart.html
│ │ ├── multiBarHorizontalChart.html
│ │ ├── multiChart.html
│ │ ├── nations.json
│ │ ├── pie.html
│ │ ├── pieChart.html
│ │ ├── scatter.html
│ │ ├── scatterChart.html
│ │ ├── scatterPlusLineChart.html
│ │ ├── sparkline.html
│ │ ├── sparklinePlus.html
│ │ ├── stackedArea.html
│ │ ├── stackedAreaChart.html
│ │ └── stream_layers.js
│ │ ├── lib
│ │ ├── cie.js
│ │ ├── crossfilter.js
│ │ ├── crossfilter.min.js
│ │ ├── d3.v2.js
│ │ ├── d3.v2.min.js
│ │ ├── fisheye.js
│ │ ├── hive.js
│ │ ├── horizon.js
│ │ └── sankey.js
│ │ ├── nv.d3.js
│ │ ├── nv.d3.min.js
│ │ └── src
│ │ ├── core.js
│ │ ├── intro.js
│ │ ├── models
│ │ ├── axis.js
│ │ ├── backup
│ │ │ ├── bullet.js
│ │ │ └── bulletChart.js
│ │ ├── bullet.js
│ │ ├── bulletChart.js
│ │ ├── cumulativeLineChart.js
│ │ ├── discreteBar.js
│ │ ├── discreteBarChart.js
│ │ ├── distribution.js
│ │ ├── historicalBar.js
│ │ ├── indentedTree.js
│ │ ├── legend.js
│ │ ├── line.js
│ │ ├── lineChart.js
│ │ ├── linePlusBarChart.js
│ │ ├── linePlusBarWithFocusChart.js
│ │ ├── lineWithFisheye.js
│ │ ├── lineWithFisheyeChart.js
│ │ ├── lineWithFocusChart.js
│ │ ├── multiBar.js
│ │ ├── multiBarChart.js
│ │ ├── multiBarHorizontal.js
│ │ ├── multiBarHorizontalChart.js
│ │ ├── multiBarTimeSeries.js
│ │ ├── multiBarTimeSeriesChart.js
│ │ ├── multiChart.js
│ │ ├── ohlcBar.js
│ │ ├── pie.js
│ │ ├── pieChart.js
│ │ ├── scatter.js
│ │ ├── scatterChart.js
│ │ ├── scatterPlusLineChart.js
│ │ ├── sparkline.js
│ │ ├── sparklinePlus.js
│ │ ├── stackedArea.js
│ │ └── stackedAreaChart.js
│ │ ├── nv.d3.css
│ │ ├── outro.js
│ │ ├── tooltip.js
│ │ └── utils.js
│ └── templates
│ ├── layout.html
│ ├── macros.jnj
│ └── partials
│ ├── address.html
│ ├── email.html
│ ├── emails.html
│ ├── sent_distribution.html
│ └── will_reply.html
├── pigrc
└── requirements.txt
/.gitignore:
--------------------------------------------------------------------------------
1 | *.txt
2 | *.pyc
3 | venv
4 | pyelasticsearch
5 |
--------------------------------------------------------------------------------
/Procfile:
--------------------------------------------------------------------------------
1 | web: python ch04/index.py
2 |
--------------------------------------------------------------------------------
/ch02/Email Analysis.xlsb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code/5a5a5f11de5ed0c4949bf1e7c80fba329fc72120/ch02/Email Analysis.xlsb
--------------------------------------------------------------------------------
/ch02/README.md:
--------------------------------------------------------------------------------
1 | Agile Data the Book
2 | ===================
3 |
4 | You can buy the book [here](http://shop.oreilly.com/product/0636920025054.do). You can read the book on [O'Reilly OFPS](http://ofps.oreilly.com/titles/9781449326265/) now. Work through the chapter code examples as you go. Don't forget to initialize your Python environment. Try Linux (apt-get, yum) or OS X (brew, port) packages if any of the requirements don't install in your [virtualenv](http://www.virtualenv.org/en/latest/).
5 |
--------------------------------------------------------------------------------
/ch03/cat_avro:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | #
4 | # derived from example at http://www.harshj.com/2010/04/25/writing-and-reading-avro-data-files-using-python/
5 | #
6 | from avro import schema, datafile, io
7 | import pprint
8 | import sys
9 | import json
10 |
11 | field_id = None
12 | # Optional key to print
13 | if (len(sys.argv) > 2):
14 | field_id = sys.argv[2]
15 |
16 | # Test reading avros
17 | rec_reader = io.DatumReader()
18 |
19 | # Create a 'data file' (avro file) reader
20 | df_reader = datafile.DataFileReader(
21 | open(sys.argv[1]),
22 | rec_reader
23 | )
24 |
25 | # Read all records stored inside
26 | pp = pprint.PrettyPrinter()
27 | i = 0
28 | for record in df_reader:
29 | if i > 20:
30 | break
31 | i += 1
32 | if field_id:
33 | pp.pprint(record[field_id])
34 | else:
35 | pp.pprint(record)
36 |
37 | obj = json.loads(df_reader.meta['avro.schema'])
38 | print "\nAvro Schema: " + json.dumps(obj)
39 |
--------------------------------------------------------------------------------
/ch03/pig/avro_to_mongo.pig:
--------------------------------------------------------------------------------
1 | /* Root directory of the locally installed software (Pig, mongo-hadoop) */
2 | %default HOME `echo \$HOME/Software/`
3 |
4 | /* Jars needed to read Avro files: Avro itself, json-simple, and Piggybank */
5 | REGISTER $HOME/pig/build/ivy/lib/Pig/avro-1.5.3.jar
6 | REGISTER $HOME/pig/build/ivy/lib/Pig/json-simple-1.1.jar
7 | REGISTER $HOME/pig/contrib/piggybank/java/piggybank.jar
8 |
9 | /* Jars needed to write to MongoDB: the Java driver plus the mongo-hadoop core and Pig modules */
10 | REGISTER $HOME/mongo-hadoop/mongo-2.10.1.jar
11 | REGISTER $HOME/mongo-hadoop/core/target/mongo-hadoop-core-1.1.0-SNAPSHOT.jar
12 | REGISTER $HOME/mongo-hadoop/pig/target/mongo-hadoop-pig-1.1.0-SNAPSHOT.jar
13 |
14 | /* Short names for the load and store functions used below */
15 | DEFINE AvroStorage org.apache.pig.piggybank.storage.avro.AvroStorage();
16 | DEFINE MongoStorage com.mongodb.hadoop.pig.MongoStorage();
17 |
18 | /* Speculative execution is disabled so that no task runs twice and inserts duplicate records into Mongo */
19 | SET mapred.map.tasks.speculative.execution false
20 | SET mapred.reduce.tasks.speculative.execution false
21 |
22 | /* $avros is the input path, for example 'enron.avro' */
23 | emails = LOAD '$avros' USING AvroStorage();
24 |
25 | /* $mongourl is the target collection, for example 'mongodb://localhost/enron.emails' */
26 | STORE emails INTO '$mongourl' USING MongoStorage();
22 |
--------------------------------------------------------------------------------
/ch03/pig/elasticsearch.pig:
--------------------------------------------------------------------------------
1 | /* Set Home Directory - where we install software */
2 | %default HOME `echo \$HOME/Software/`
3 |
4 | /* Avro uses json-simple, and is in piggybank until Pig 0.12, where AvroStorage and TrevniStorage are builtins */
5 | REGISTER $HOME/pig/build/ivy/lib/Pig/avro-1.5.3.jar
6 | REGISTER $HOME/pig/build/ivy/lib/Pig/json-simple-1.1.jar
7 | REGISTER $HOME/pig/contrib/piggybank/java/piggybank.jar
8 |
9 | DEFINE AvroStorage org.apache.pig.piggybank.storage.avro.AvroStorage();
10 |
11 | /* Elasticsearch's own jars */
12 | REGISTER $HOME/elasticsearch-0.20.2/lib/*.jar
13 |
14 | /* Register wonderdog - elasticsearch integration */
15 | REGISTER $HOME/wonderdog/target/wonderdog-1.0-SNAPSHOT.jar
16 |
17 | /* Remove the old json */
18 | rmf /tmp/sent_count_json
19 |
20 | /* Nuke the elasticsearch inbox/sentcounts type, as we are about to replace it.
21 |    This must name the same type that we index into and search below, otherwise stale documents survive a re-run. */
22 | sh curl -XDELETE 'http://localhost:9200/inbox/sentcounts'
23 |
24 | /* Load the sent counts (from, to, total) from the text file, and store them as JSON */
25 | sent_counts = LOAD '/tmp/sent_counts.txt' AS (from:chararray, to:chararray, total:long);
26 | STORE sent_counts INTO '/tmp/sent_count_json' USING JsonStorage();
27 |
28 | /* Now load the JSON as a single chararray field, and index it into ElasticSearch with Wonderdog from InfoChimps */
29 | sent_count_json = LOAD '/tmp/sent_count_json' AS (sent_counts:chararray);
30 | STORE sent_count_json INTO 'es://inbox/sentcounts?json=true&size=1000' USING com.infochimps.elasticsearch.pig.ElasticSearchStorage(
31 | '$HOME/elasticsearch-0.20.2/config/elasticsearch.yml',
32 | '$HOME/elasticsearch-0.20.2/plugins');
33 |
34 | /* Search for 'russell' to make sure we get a hit in our inbox/sentcounts type */
35 | sh curl -XGET 'http://localhost:9200/inbox/sentcounts/_search?q=russell&pretty=true&size=1'
35 |
--------------------------------------------------------------------------------
/ch03/pig/mongo.pig:
--------------------------------------------------------------------------------
1 | /* Root directory of the locally installed software (mongo-hadoop) */
2 | %default HOME `echo \$HOME/Software/`
3 |
4 | /* MongoDB Java driver plus the mongo-hadoop core and Pig modules */
5 | REGISTER $HOME/mongo-hadoop/mongo-2.10.1.jar
6 | REGISTER $HOME/mongo-hadoop/core/target/mongo-hadoop-core-1.1.0-SNAPSHOT.jar
7 | REGISTER $HOME/mongo-hadoop/pig/target/mongo-hadoop-pig-1.1.0-SNAPSHOT.jar
8 |
9 | /* Short name for the Mongo store function */
10 | DEFINE MongoStorage com.mongodb.hadoop.pig.MongoStorage();
11 |
12 | /* Turn speculative execution off for both map and reduce tasks */
13 | SET mapred.map.tasks.speculative.execution false
14 | SET mapred.reduce.tasks.speculative.execution false
15 |
16 | /* Read the (from, to, total) rows and publish them to the agile_data.sent_counts collection */
17 | counts = LOAD '/tmp/sent_counts.txt' AS (from:chararray, to:chararray, total:long);
18 | STORE counts INTO 'mongodb://localhost/agile_data.sent_counts' USING MongoStorage();
13 |
--------------------------------------------------------------------------------
/ch03/pig/sent_counts.pig:
--------------------------------------------------------------------------------
1 | /* Set Home Directory - where we install software */
2 | %default HOME `echo \$HOME/Software/`
3 | /* Avro, json-simple and piggybank jars, needed by the AvroStorage loader defined below */
4 | REGISTER $HOME/pig/build/ivy/lib/Pig/avro-1.5.3.jar
5 | REGISTER $HOME/pig/build/ivy/lib/Pig/json-simple-1.1.jar
6 | REGISTER $HOME/pig/contrib/piggybank/java/piggybank.jar
7 |
8 | DEFINE AvroStorage org.apache.pig.piggybank.storage.avro.AvroStorage();
9 | /* Remove any previous output at the path this script STOREs into at the end */
10 | rmf /tmp/sent_counts.txt
11 |
12 | /* Load the emails in avro format (edit the path to match where you saved them) using the AvroStorage UDF from Piggybank */
13 | messages = LOAD '/me/Data/test_mbox' USING AvroStorage();
14 |
15 | /* Filter nulls, they won't help */
16 | messages = FILTER messages BY (from IS NOT NULL) AND (tos IS NOT NULL);
17 |
18 | /* Emails can be 'to' more than one person. FLATTEN() will project our from with each 'to' that exists. */
19 | addresses = FOREACH messages GENERATE from.address AS from, FLATTEN(tos.(address)) AS to;
20 |
21 | /* Lowercase the email addresses, so we don't count MiXed case of the same address as multiple addresses */
22 | lowers = FOREACH addresses GENERATE LOWER(from) AS from, LOWER(to) AS to;
23 |
24 | /* GROUP BY each from/to pair into a bag (array), then count the bag's contents (the bag is the 2nd field, $1, referenced below by its name, lowers) to get a total.
25 | Same as SQL: SELECT from, to, COUNT(*) FROM lowers GROUP BY (from, to);
26 | Note: COUNT_STAR differs from COUNT in that it counts nulls. */
27 | by_from_to = GROUP lowers BY (from, to);
28 | sent_counts = FOREACH by_from_to GENERATE FLATTEN(group) AS (from, to), COUNT_STAR(lowers) AS total;
29 |
30 | /* Sort the data, highest sent count first */
31 | sent_counts = ORDER sent_counts BY total DESC;
32 | STORE sent_counts INTO '/tmp/sent_counts.txt';
33 |
--------------------------------------------------------------------------------
/ch03/python/elasticsearch.py:
--------------------------------------------------------------------------------
1 | import pyelasticsearch
2 | elastic = pyelasticsearch.ElasticSearch('http://localhost:9200/inbox')
3 | results = elastic.search("hadoop", index="sentcounts")
4 | print results
5 |
--------------------------------------------------------------------------------
/ch03/python/flask_echo.py:
--------------------------------------------------------------------------------
1 | from flask import Flask
2 | app = Flask(__name__)
3 |
4 | @app.route("/")
5 | def hello(input):
6 | return input
7 |
8 | if __name__ == "__main__":
9 | app.run(debug=True)
10 |
--------------------------------------------------------------------------------
/ch03/python/flask_mongo.py:
--------------------------------------------------------------------------------
1 | from flask import Flask
2 | import pymongo
3 | import json
4 |
5 | # Setup Flask
6 | app = Flask(__name__)
7 |
8 | # Setup Mongo
9 | conn = pymongo.Connection() # defaults to localhost
10 | db = conn.agile_data
11 | sent_counts = db['sent_counts']
12 |
13 | # Fetch from/to totals, given a pair of email addresses
14 | @app.route("/sent_counts//")
15 | def sent_count(from_address, to_address):
16 | sent_count = sent_counts.find_one( {'from': from_address, 'to': to_address} )
17 | return json.dumps( {'from': sent_count['from'], 'to': sent_count['to'], 'total': sent_count['total']} )
18 |
19 | if __name__ == "__main__":
20 | app.run(debug=True)
21 |
--------------------------------------------------------------------------------
/ch03/python/mongo.py:
--------------------------------------------------------------------------------
1 | import pymongo
2 | import json
3 |
4 | conn = pymongo.Connection() # defaults to localhost
5 | db = conn.agile_data
6 | results = db['sent_counts'].find()
7 | for i in range(0, results.count()): # Loop and print all results
8 | print results[i]
9 |
10 |
--------------------------------------------------------------------------------
/ch03/python/test_avro.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | #
4 | # derived from helpful example at http://www.harshj.com/2010/04/25/writing-and-reading-avro-data-files-using-python/
5 | #
6 | from avro import schema, datafile, io
7 | import pprint
8 |
9 | # Test writing avros
10 | OUTFILE_NAME = '/tmp/messages.avro'
11 |
12 | SCHEMA_STR = """{
13 | "type": "record",
14 | "name": "Message",
15 | "fields" : [
16 | {"name": "message_id", "type": "int"},
17 | {"name": "topic", "type": "string"},
18 | {"name": "user_id", "type": "int"}
19 | ]
20 | }"""
21 |
22 | SCHEMA = schema.parse(SCHEMA_STR)
23 |
24 | # Create a 'record' (datum) writer
25 | rec_writer = io.DatumWriter(SCHEMA)
26 |
27 | # Create a 'data file' (avro file) writer
28 | df_writer = datafile.DataFileWriter(
29 | open(OUTFILE_NAME, 'wb'),
30 | rec_writer,
31 | writers_schema = SCHEMA
32 | )
33 |
34 | df_writer.append( {"message_id": 11, "topic": "Hello galaxy", "user_id": 1} )
35 | df_writer.append( {"message_id": 12, "topic": "Jim is silly!", "user_id": 1} )
36 | df_writer.append( {"message_id": 23, "topic": "I like apples.", "user_id": 2} )
37 | df_writer.close()
38 |
39 | # Test reading avros
40 | rec_reader = io.DatumReader()
41 |
42 | # Create a 'data file' (avro file) reader
43 | df_reader = datafile.DataFileReader(
44 | open(OUTFILE_NAME),
45 | rec_reader
46 | )
47 |
48 | # Read all records stored inside
49 | pp = pprint.PrettyPrinter()
50 | for record in df_reader:
51 | pp.pprint(record)
52 |
--------------------------------------------------------------------------------
/ch03/web/index.py:
--------------------------------------------------------------------------------
1 | from flask import Flask, render_template
2 | import pymongo
3 | import json
4 | import re
5 |
6 | # Setup Flask
7 | app = Flask(__name__)
8 |
9 | # Setup Mongo
10 | conn = pymongo.Connection() # defaults to localhost
11 | db = conn.agile_data
12 |
13 | # Fetch from/to totals and list them
14 | @app.route("/sent_counts")
15 | def sent_counts():
16 | sent_counts = db['sent_counts'].find()
17 | results = {}
18 | results['keys'] = 'from', 'to', 'total'
19 | results['values'] = [[s['from'], s['to'], s['total']] for s in sent_counts if re.search('apache', str(s['from'])) or re.search('apache', str(s['to']))]
20 | results['values'] = results['values'][0:17]
21 | return render_template('table.html', results=results)
22 |
23 | if __name__ == "__main__":
24 | app.run(debug=True)
25 |
--------------------------------------------------------------------------------
/ch03/web/static/bootstrap/img/glyphicons-halflings-white.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code/5a5a5f11de5ed0c4949bf1e7c80fba329fc72120/ch03/web/static/bootstrap/img/glyphicons-halflings-white.png
--------------------------------------------------------------------------------
/ch03/web/static/bootstrap/img/glyphicons-halflings.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code/5a5a5f11de5ed0c4949bf1e7c80fba329fc72120/ch03/web/static/bootstrap/img/glyphicons-halflings.png
--------------------------------------------------------------------------------
/ch03/web/static/nvd3/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | # Jekyll Files #
3 | ################
4 | _site
5 |
6 |
7 | # Random Files #
8 | ################
9 | *.swp
10 | *~
11 | *.log
12 |
13 |
14 | # Private Test Data #
15 | #####################
16 | *REALDATA*
17 |
18 |
19 | # OS generated files #
20 | ######################
21 | .DS_Store*
22 | ehthumbs.db
23 | Icon?
24 | Thumbs.db
25 |
--------------------------------------------------------------------------------
/ch03/web/static/nvd3/LICENSE.md:
--------------------------------------------------------------------------------
1 |
2 | ##nvd3.js License
3 |
4 | Copyright (c) 2011, 2012 [Novus Partners, Inc.][novus]
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 |
18 | [novus]: https://www.novus.com/
19 |
20 |
21 |
22 | ##d3.js License
23 |
24 | Copyright (c) 2012, Michael Bostock
25 | All rights reserved.
26 |
27 | Redistribution and use in source and binary forms, with or without
28 | modification, are permitted provided that the following conditions are met:
29 |
30 | * Redistributions of source code must retain the above copyright notice, this
31 | list of conditions and the following disclaimer.
32 |
33 | * Redistributions in binary form must reproduce the above copyright notice,
34 | this list of conditions and the following disclaimer in the documentation
35 | and/or other materials provided with the distribution.
36 |
37 | * The name Michael Bostock may not be used to endorse or promote products
38 | derived from this software without specific prior written permission.
39 |
40 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
41 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
43 | DISCLAIMED. IN NO EVENT SHALL MICHAEL BOSTOCK BE LIABLE FOR ANY DIRECT,
44 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
45 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
46 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
47 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
48 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
49 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
50 |
--------------------------------------------------------------------------------
/ch03/web/static/nvd3/Makefile:
--------------------------------------------------------------------------------
1 | JS_FILES = \
2 | src/intro.js \
3 | src/core.js \
4 | src/tooltip.js \
5 | src/utils.js \
6 | src/models/axis.js \
7 | src/models/historicalBar.js \
8 | src/models/bullet.js \
9 | src/models/bulletChart.js \
10 | src/models/cumulativeLineChart.js \
11 | src/models/discreteBar.js \
12 | src/models/discreteBarChart.js \
13 | src/models/distribution.js \
14 | src/models/indentedTree.js \
15 | src/models/legend.js \
16 | src/models/line.js \
17 | src/models/lineChart.js \
18 | src/models/linePlusBarChart.js \
19 | src/models/lineWithFocusChart.js \
20 | src/models/multiBar.js \
21 | src/models/multiBarChart.js \
22 | src/models/multiBarHorizontal.js \
23 | src/models/multiBarHorizontalChart.js \
24 | src/models/multiChart.js \
25 | src/models/ohlcBar.js \
26 | src/models/pie.js \
27 | src/models/pieChart.js \
28 | src/models/scatter.js \
29 | src/models/scatterChart.js \
30 | src/models/scatterPlusLineChart.js \
31 | src/models/sparkline.js \
32 | src/models/sparklinePlus.js \
33 | src/models/stackedArea.js \
34 | src/models/stackedAreaChart.js \
35 | src/outro.js
36 | # Command the %.min.js rule pipes the concatenated sources through
37 | JS_COMPILER = \
38 | uglifyjs
39 | # Default goal builds both bundles; each one depends on every file listed in JS_FILES
40 | all: nv.d3.js nv.d3.min.js
41 | nv.d3.js: $(JS_FILES)
42 | nv.d3.min.js: $(JS_FILES)
43 | # Rebuild from scratch: delete the target ($@), then append every .js prerequisite ($^ filtered to %.js)
44 | nv.d3.js: Makefile
45 | rm -f $@
46 | cat $(filter %.js,$^) >> $@
47 | # Same concatenation, piped through $(JS_COMPILER) before being written to the target
48 | %.min.js:: Makefile
49 | rm -f $@
50 | cat $(filter %.js,$^) | $(JS_COMPILER) >> $@
51 | # Remove both generated bundles
52 | clean:
53 | rm -rf nv.d3.js nv.d3.min.js
54 |
55 |
56 |
--------------------------------------------------------------------------------
/ch03/web/static/nvd3/README.md:
--------------------------------------------------------------------------------
1 | Please see Novus' official statement on nvd3 with an explanation,
2 | apology, and commitment to its permanent status as an open-source
3 | project.
4 | [http://nvd3.org/statement.html](http://nvd3.org/statement.html)
5 |
6 | # nvd3 - v0.0.1
7 |
8 | A reusable chart library for d3.JS.
9 |
10 | Currently in an early stage of development, but will be a very active project. It may change quite a bit from its current state, but will always try to follow the style in which d3.js was done.
11 |
12 | You can also check out the [examples page](http://nvd3.org/ghpages/examples.html)
13 |
14 | ---
15 |
16 | If one of [the existing models](https://github.com/novus/nvd3/tree/master/src/models) doesn't meet your needs, fork the project, implement the model and an example using it, send us a pull request, for consideration for inclusion in the project.
17 |
18 | ---
19 |
20 | Minifying your fork:
21 |
22 | The Makefile requires [UglifyJS](https://github.com/mishoo/UglifyJS).
23 |
24 | The easiest way to install is to install via npm. Run `npm install
25 | uglify-js` from your home directory, then add the output from `npm bin`
26 | into your path so that you have access to `uglifyjs` from the command
27 | line (remember to restart your terminal window when adding to the path.)
28 |
29 | Once you have `uglifyjs` command available, running `make` from your
30 | fork's root directory will rebuild both `nv.d3.js` and `nv.d3.min.js`.
31 |
32 | Without UglifyJS, you won't get the minified version when running make.
33 |
34 | **We ask that you DO NOT minify pull requests...**
35 | If you need to minify, please build the pull request in a separate branch, and
36 | merge and minify in your master.
37 |
38 |
--------------------------------------------------------------------------------
/ch03/web/static/nvd3/build.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 | REM Concatenate the nvd3 sources into nv.d3.js in the same order as build.sh: intro, core, tooltip, utils, models, outro
3 | copy src\intro.js /B + src\core.js /B + src\tooltip.js /B + src\utils.js /B temp1.js /B
4 | copy src\models\*.js /B temp2.js /B
5 | copy temp1.js /B + temp2.js /B + src\outro.js /B nv.d3.js /B
6 | del temp1.js
7 | del temp2.js
--------------------------------------------------------------------------------
/ch03/web/static/nvd3/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | COMPRESSOR=`which yui-compressor`
3 | cat src/intro.js src/core.js src/tooltip.js src/utils.js src/models/*.js src/outro.js > nv.d3.js
4 | # `which` prints nothing when yui-compressor is missing and `[ -e ]` with no operand is true, so require a non-empty executable path
5 | if [ -n "$COMPRESSOR" ] && [ -x "$COMPRESSOR" ]; then
6 |   "$COMPRESSOR" --type js -o nv.d3.min.js nv.d3.js
7 | fi
--------------------------------------------------------------------------------
/ch03/web/static/nvd3/deprecated/lineChart-old.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
84 |
--------------------------------------------------------------------------------
/ch03/web/static/nvd3/examples/images/grey-minus.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code/5a5a5f11de5ed0c4949bf1e7c80fba329fc72120/ch03/web/static/nvd3/examples/images/grey-minus.png
--------------------------------------------------------------------------------
/ch03/web/static/nvd3/examples/images/grey-plus.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code/5a5a5f11de5ed0c4949bf1e7c80fba329fc72120/ch03/web/static/nvd3/examples/images/grey-plus.png
--------------------------------------------------------------------------------
/ch03/web/static/nvd3/examples/legend.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
76 |
--------------------------------------------------------------------------------
/ch03/web/static/nvd3/examples/line.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
96 |
--------------------------------------------------------------------------------
/ch03/web/static/nvd3/examples/lineWithFocusChart.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
88 |
--------------------------------------------------------------------------------
/ch03/web/static/nvd3/examples/multiBar.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
93 |
--------------------------------------------------------------------------------
/ch03/web/static/nvd3/examples/multiBarChart.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
81 |
--------------------------------------------------------------------------------
/ch03/web/static/nvd3/examples/pie.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
94 |
--------------------------------------------------------------------------------
/ch03/web/static/nvd3/examples/sparkline.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
21 |
22 |
23 | Sparkline:
24 |
25 |
26 |
27 |
28 |
29 |
63 |
--------------------------------------------------------------------------------
/ch03/web/static/nvd3/examples/sparklinePlus.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
18 |
19 |
20 | SparklinePlus:
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
68 |
--------------------------------------------------------------------------------
/ch03/web/static/nvd3/examples/stream_layers.js:
--------------------------------------------------------------------------------
1 |
2 | /* Inspired by Lee Byron's test data generator. */
3 | function stream_layers(n, m, o) {
4 | if (arguments.length < 3) o = 0;
5 | function bump(a) {
6 | var x = 1 / (.1 + Math.random()),
7 | y = 2 * Math.random() - .5,
8 | z = 10 / (.1 + Math.random());
9 | for (var i = 0; i < m; i++) {
10 | var w = (i / m - y) * z;
11 | a[i] += x * Math.exp(-w * w);
12 | }
13 | }
14 | return d3.range(n).map(function() {
15 | var a = [], i;
16 | for (i = 0; i < m; i++) a[i] = o + o * Math.random();
17 | for (i = 0; i < 5; i++) bump(a);
18 | return a.map(stream_index);
19 | });
20 | }
21 |
22 | /* Another layer generator using gamma distributions. */
23 | function stream_waves(n, m) {
24 | return d3.range(n).map(function(i) {
25 | return d3.range(m).map(function(j) {
26 | var x = 20 * j / m - i / 3;
27 | return 2 * x * Math.exp(-.5 * x);
28 | }).map(stream_index);
29 | });
30 | }
31 |
32 | function stream_index(d, i) {
33 | return {x: i, y: Math.max(0, d)};
34 | }
35 |
36 |
--------------------------------------------------------------------------------
/ch03/web/static/nvd3/src/intro.js:
--------------------------------------------------------------------------------
1 | (function(){
2 |
--------------------------------------------------------------------------------
/ch03/web/static/nvd3/src/outro.js:
--------------------------------------------------------------------------------
1 | })();
--------------------------------------------------------------------------------
/ch04/.dotcloud/config:
--------------------------------------------------------------------------------
1 | {
2 | "push_branch": null,
3 | "application": "testola",
4 | "version": "0.9.4",
5 | "push_protocol": "rsync"
6 | }
--------------------------------------------------------------------------------
/ch04/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code/5a5a5f11de5ed0c4949bf1e7c80fba329fc72120/ch04/__init__.py
--------------------------------------------------------------------------------
/ch04/dotcloud.yml:
--------------------------------------------------------------------------------
1 | www:
2 | type: python
3 | systempackages:
4 | - libatlas-base-dev
5 | - gfortran
6 | - libsnappy1
7 | - libsnappy-dev
8 | data:
9 | type: mongodb
--------------------------------------------------------------------------------
/ch04/index.py:
--------------------------------------------------------------------------------
1 | from flask import Flask
2 | import os
3 |
4 | # Setup Flask
5 | app = Flask(__name__)
6 |
7 | # Simple echo service
8 | @app.route("/")
9 | def hello(input):
10 | return input
11 |
12 | if __name__ == "__main__":
13 | port = int(os.environ.get('PORT', 5000))
14 | app.run(host='0.0.0.0', port=port)
15 |
--------------------------------------------------------------------------------
/ch04/requirements.txt:
--------------------------------------------------------------------------------
1 | ##################################################################################################
2 | # 2013.12.31 - Added requirements.txt to allow dotcloud to build and obtain all dependent packages
3 | ##################################################################################################
4 |
5 | #BareNecessities==0.2.8
6 | #ESClient==0.5.3
7 | Flask==0.9
8 | Jinja2==2.6
9 | ##LEPL==5.1.3
10 | ##Mail==2.1.0
11 | #Werkzeug==0.8.3
12 | ##distribute==0.6.31
13 | ##python-snappy
14 | ##avro==1.7.3
15 | -e git+https://github.com/rhec/pyelasticsearch.git#egg=pyelasticsearch
16 | pymongo==2.4.1
17 | requests==1.0.4
18 | simplejson==2.6.2
19 | wsgiref==0.1.2
20 | ##numpy
21 | ##honcho
22 | ##scipy
23 | dotcloud
24 | ##python-dateutil
25 | ##nltk
26 |
--------------------------------------------------------------------------------
/ch04/wsgi.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append('/home/dotcloud/current')  # presumably the dotcloud deploy directory holding index.py - confirm
3 | from index import app as application  # re-export index.app under the name `application`
--------------------------------------------------------------------------------
/ch05/README.md:
--------------------------------------------------------------------------------
1 | Agile Data the Book
2 | ===================
3 |
4 | You can buy the book [here](http://shop.oreilly.com/product/0636920025054.do). You can read the book on [O'Reilly OFPS](http://ofps.oreilly.com/titles/9781449326265/) now. Work the chapter code examples as you go. Don't forget to initialize your python environment. Try linux (apt-get, yum) or OS X (brew, port) packages if any of the requirements don't install in your [virtualenv](http://www.virtualenv.org/en/latest/).
5 |
6 | Agile Data - Chapter 5: Collecting and Displaying Atomic Records
7 | ===============================================================
8 |
9 | ## Setup Python Virtual Environment ##
10 |
11 | ```
12 | # From project root
13 |
14 | # Setup python virtualenv
15 | virtualenv -p `which python2.7` venv --distribute
16 | source venv/bin/activate
17 | pip install -r requirements.txt
18 | ```
19 |
20 | ## Store Emails in MongoDB ##
21 |
22 | ```
23 | pig -l /tmp -x local -param avros=<path_to_avro_emails> -param mongourl=mongodb://localhost/agile_data.emails -v -w avro_to_mongo.pig
24 | ```
25 |
26 | ## Create the date and message_id indexes in MongoDB ##
27 |
28 | ```
29 | mongo < list_emails.mongo.js
30 | ```
31 |
32 | Or paste that file into the mongo shell.
33 |
34 | ## Access Emails from Python ##
35 |
36 | To test the 'pymongo' module by listing emails, run:
37 |
38 | ```
39 | python ./mongo_list.py
40 | ```
41 |
42 | ## Store Emails in ElasticSearch ##
43 | ```
44 | pig -l /tmp -x local -v -w ./elasticsearch.pig
45 | ```
46 | ## Search Emails from Python ##
47 |
48 | Test pyelasticsearch and the ElasticSearch query/sort APIs via:
49 |
50 | ```
51 | python elasticsearch.py
52 | ```
53 |
54 | ## Run Inbox Application ##
55 |
56 | Finally, run our Python/Flask web application.
57 |
58 | ```
59 | python web/index.py
60 | ```
61 |
62 |
--------------------------------------------------------------------------------
/ch05/avro_to_mongo.pig:
--------------------------------------------------------------------------------
1 | /* Set Home Directory - where we install software */
2 | %default HOME `echo \$HOME/Software/`
3 |
4 | /* Load Avro jars and define shortcut */
5 | REGISTER $HOME/pig/build/ivy/lib/Pig/avro-1.5.3.jar
6 | REGISTER $HOME/pig/build/ivy/lib/Pig/json-simple-1.1.jar
7 | REGISTER $HOME/pig/contrib/piggybank/java/piggybank.jar
8 | define AvroStorage org.apache.pig.piggybank.storage.avro.AvroStorage();
9 |
10 | /* MongoDB libraries and configuration */
11 | REGISTER $HOME/mongo-hadoop/mongo-2.10.1.jar
12 | REGISTER $HOME/mongo-hadoop/core/target/mongo-hadoop-core-1.1.0-SNAPSHOT.jar
13 | REGISTER $HOME/mongo-hadoop/pig/target/mongo-hadoop-pig-1.1.0-SNAPSHOT.jar
14 |
15 | /* Set speculative execution off so we don't have the chance of duplicate records in Mongo */
16 | set mapred.map.tasks.speculative.execution false
17 | set mapred.reduce.tasks.speculative.execution false
18 | define MongoStorage com.mongodb.hadoop.pig.MongoStorage(); /* Shortcut */
19 |
20 | avros = load '$avros' using AvroStorage(); /* For example, 'enron.avro' */
21 | store avros into '$mongourl' using MongoStorage(); /* For example, 'mongodb://localhost/enron.emails' */
--------------------------------------------------------------------------------
/ch05/elasticsearch.pig:
--------------------------------------------------------------------------------
1 | /* Set Home Directory - where we install software */
2 | %default HOME `echo \$HOME/Software/`
3 | /* Pipeline: Avro emails -> JSON files in /tmp/inbox_json -> ElasticSearch inbox/emails via Wonderdog */
4 | /* Avro uses json-simple, and is in piggybank until Pig 0.12, where AvroStorage and TrevniStorage are builtins */
5 | REGISTER $HOME/pig/build/ivy/lib/Pig/avro-1.5.3.jar
6 | REGISTER $HOME/pig/build/ivy/lib/Pig/json-simple-1.1.jar
7 | REGISTER $HOME/pig/contrib/piggybank/java/piggybank.jar
8 |
9 | DEFINE AvroStorage org.apache.pig.piggybank.storage.avro.AvroStorage();
10 |
11 | /* Elasticsearch's own jars */
12 | REGISTER $HOME/elasticsearch-0.20.2/lib/*.jar
13 |
14 | /* Register wonderdog - elasticsearch integration */
15 | REGISTER $HOME/wonderdog/target/wonderdog-1.0-SNAPSHOT.jar
16 |
17 | /* Remove the old email json */
18 | rmf /tmp/inbox_json
19 |
20 | /* Nuke the elasticsearch emails index, as we are about to replace it. */
21 | sh curl -XDELETE 'http://localhost:9200/inbox/emails'
22 |
23 | /* Load Avros, and store as JSON */
24 | emails = LOAD '/me/Data/test_mbox' USING AvroStorage();
25 | STORE emails INTO '/tmp/inbox_json' USING JsonStorage();
26 |
27 | /* Now load the JSON as a single chararray field, and index it into ElasticSearch with Wonderdog from InfoChimps */
28 | email_json = LOAD '/tmp/inbox_json' AS (email:chararray);
29 | STORE email_json INTO 'es://inbox/emails?json=true&size=1000' USING com.infochimps.elasticsearch.pig.ElasticSearchStorage(
30 | '$HOME/elasticsearch-0.20.2/config/elasticsearch.yml',
31 | '$HOME/elasticsearch-0.20.2/plugins');
32 |
33 | /* Search for Hadoop to make sure we get a hit in our email index */
34 | sh curl -XGET 'http://localhost:9200/inbox/emails/_search?q=hadoop&pretty=true&size=1'
35 |
--------------------------------------------------------------------------------
/ch05/elasticsearch.py:
--------------------------------------------------------------------------------
1 | import pyelasticsearch
2 | elastic = pyelasticsearch.ElasticSearch('http://localhost:9200/inbox')
3 | results = elastic.search("hadoop", index="emails")
4 | print results
5 | results2 = elastic.search({'query': {"term": { "body": query}}, 'from': 0, 'size': 20}, index="emails")
6 | print results2
7 |
8 |
9 |
--------------------------------------------------------------------------------
/ch05/list_emails.mongo.js:
--------------------------------------------------------------------------------
1 | use agile_data;
2 | show collections;
3 | db.emails.findOne();
4 | db.emails.find();
5 | db.emails.find().sort({date: 1});
6 | // error: {
7 | // "$err" : "too much data for sort() with no index. add an index or specify a smaller limit",
8 | // "code" : 10128
9 | }
10 | db.emails.getIndexes();
11 | // [
12 | // {
13 | // "v" : 1,
14 | // "key" : {
15 | // "_id" : 1
16 | // },
17 | // "ns" : "agile_data.emails",
18 | // "name" : "_id_"
19 | // }
20 | // ]
21 | db.emails.ensureIndex({date: 1}); // Add an index on date
22 | db.emails.getIndexes();
23 | // [
24 | // {
25 | // "v" : 1,
26 | // "key" : {
27 | // "_id" : 1
28 | // },
29 | // "ns" : "agile_data.emails",
30 | // "name" : "_id_"
31 | // },
32 | // {
33 | // "v" : 1,
34 | // "key" : {
35 | // "date" : 1
36 | // },
37 | // "ns" : "agile_data.emails",
38 | // "name" : "date_1"
39 | // }
40 | // ]
41 | db.emails.find().sort({date: 1});
42 | // ... lots of sorted emails ...
43 | db.emails.ensureIndex({message_id: 1}); // Add message_id index
44 | db.emails.getIndexes();
45 | // [
46 | // {
47 | // "v" : 1,
48 | // "key" : {
49 | // "_id" : 1
50 | // },
51 | // "ns" : "agile_data.emails",
52 | // "name" : "_id_"
53 | // },
54 | // {
55 | // "v" : 1,
56 | // "key" : {
57 | // "date" : 1
58 | // },
59 | // "ns" : "agile_data.emails",
60 | // "name" : "date_1"
61 | // },
62 | // {
63 | // "v" : 1,
64 | // "key" : {
65 | // "message_id" : 1
66 | // },
67 | // "ns" : "agile_data.emails",
68 | // "name" : "message_id_1"
69 | // }
70 | // ]
71 | db.emails.find().sort({date: -1}).limit(10).pretty(); // Fetch last 10 emails (newest first), pretty format
72 |
--------------------------------------------------------------------------------
/ch05/mongo_list.py:
--------------------------------------------------------------------------------
1 | import pymongo
2 |
3 | # Setup Mongo
4 | conn = pymongo.Connection() # defaults to localhost
5 | db = conn.agile_data
6 | emails = db['emails']
7 |
8 | email_list = emails.find()[0:20]
9 | for email in email_list:
10 | print email
11 |
--------------------------------------------------------------------------------
/ch05/web/config.py:
--------------------------------------------------------------------------------
1 | EMAILS_PER_PAGE=16
2 | ELASTIC_URL='http://localhost:9200/inbox'
--------------------------------------------------------------------------------
/ch05/web/static/bootstrap/img/glyphicons-halflings-white.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code/5a5a5f11de5ed0c4949bf1e7c80fba329fc72120/ch05/web/static/bootstrap/img/glyphicons-halflings-white.png
--------------------------------------------------------------------------------
/ch05/web/static/bootstrap/img/glyphicons-halflings.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code/5a5a5f11de5ed0c4949bf1e7c80fba329fc72120/ch05/web/static/bootstrap/img/glyphicons-halflings.png
--------------------------------------------------------------------------------
/ch05/web/static/nvd3/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | # Jekyll Files #
3 | ################
4 | _site
5 |
6 |
7 | # Random Files #
8 | ################
9 | *.swp
10 | *~
11 | *.log
12 |
13 |
14 | # Private Test Data #
15 | #####################
16 | *REALDATA*
17 |
18 |
19 | # OS generated files #
20 | ######################
21 | .DS_Store*
22 | ehthumbs.db
23 | Icon?
24 | Thumbs.db
25 |
--------------------------------------------------------------------------------
/ch05/web/static/nvd3/LICENSE.md:
--------------------------------------------------------------------------------
1 |
2 | ##nvd3.js License
3 |
4 | Copyright (c) 2011, 2012 [Novus Partners, Inc.][novus]
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 |
18 | [novus]: https://www.novus.com/
19 |
20 |
21 |
22 | ##d3.js License
23 |
24 | Copyright (c) 2012, Michael Bostock
25 | All rights reserved.
26 |
27 | Redistribution and use in source and binary forms, with or without
28 | modification, are permitted provided that the following conditions are met:
29 |
30 | * Redistributions of source code must retain the above copyright notice, this
31 | list of conditions and the following disclaimer.
32 |
33 | * Redistributions in binary form must reproduce the above copyright notice,
34 | this list of conditions and the following disclaimer in the documentation
35 | and/or other materials provided with the distribution.
36 |
37 | * The name Michael Bostock may not be used to endorse or promote products
38 | derived from this software without specific prior written permission.
39 |
40 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
41 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
43 | DISCLAIMED. IN NO EVENT SHALL MICHAEL BOSTOCK BE LIABLE FOR ANY DIRECT,
44 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
45 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
46 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
47 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
48 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
49 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
50 |
--------------------------------------------------------------------------------
/ch05/web/static/nvd3/Makefile:
--------------------------------------------------------------------------------
# Sources concatenated, in this order, to build nv.d3.js.
# src/intro.js opens and src/outro.js closes the function wrapper around
# everything in between, so they must stay first and last.
JS_FILES = \
	src/intro.js \
	src/core.js \
	src/tooltip.js \
	src/utils.js \
	src/models/axis.js \
	src/models/historicalBar.js \
	src/models/bullet.js \
	src/models/bulletChart.js \
	src/models/cumulativeLineChart.js \
	src/models/discreteBar.js \
	src/models/discreteBarChart.js \
	src/models/distribution.js \
	src/models/indentedTree.js \
	src/models/legend.js \
	src/models/line.js \
	src/models/lineChart.js \
	src/models/linePlusBarChart.js \
	src/models/lineWithFocusChart.js \
	src/models/multiBar.js \
	src/models/multiBarChart.js \
	src/models/multiBarHorizontal.js \
	src/models/multiBarHorizontalChart.js \
	src/models/multiChart.js \
	src/models/ohlcBar.js \
	src/models/pie.js \
	src/models/pieChart.js \
	src/models/scatter.js \
	src/models/scatterChart.js \
	src/models/scatterPlusLineChart.js \
	src/models/sparkline.js \
	src/models/sparklinePlus.js \
	src/models/stackedArea.js \
	src/models/stackedAreaChart.js \
	src/outro.js

# Minifier used to produce nv.d3.min.js; must be available on PATH.
JS_COMPILER = \
	uglifyjs

# `all` and `clean` name actions, not files: declare them phony so a stray
# file called "all" or "clean" cannot make them look up to date.
.PHONY: all clean

all: nv.d3.js nv.d3.min.js
nv.d3.js: $(JS_FILES)
nv.d3.min.js: $(JS_FILES)

# Rebuild from scratch whenever a source file or this Makefile changes.
# $(filter %.js,$^) drops the Makefile itself from the list of inputs.
nv.d3.js: Makefile
	rm -f $@
	cat $(filter %.js,$^) >> $@

%.min.js:: Makefile
	rm -f $@
	cat $(filter %.js,$^) | $(JS_COMPILER) >> $@

clean:
	rm -rf nv.d3.js nv.d3.min.js
54 |
55 |
56 |
--------------------------------------------------------------------------------
/ch05/web/static/nvd3/README.md:
--------------------------------------------------------------------------------
1 | Please see Novus' official statement on nvd3 with an explanation,
2 | apology, and commitment to its permanent status as an open-source
3 | project.
4 | [http://nvd3.org/statement.html](http://nvd3.org/statement.html)
5 |
6 | # nvd3 - v0.0.1
7 |
8 | A reusable chart library for d3.JS.
9 |
10 | Currently in an early stage of development, but will be a very active project. It may change quite a bit from its current state, but will always try to follow the style in which d3.js was done.
11 |
12 | You can also check out the [examples page](http://nvd3.org/ghpages/examples.html)
13 |
14 | ---
15 |
16 | If none of [the existing models](https://github.com/novus/nvd3/tree/master/src/models) meets your needs, fork the project, implement the model along with an example that uses it, and send us a pull request to be considered for inclusion in the project.
17 |
18 | ---
19 |
20 | Minifying your fork:
21 |
22 | The Makefile requires [UglifyJS](https://github.com/mishoo/UglifyJS).
23 |
24 | The easiest way to install it is via npm. Run `npm install
25 | uglify-js` from your home directory, then add the output from `npm bin`
26 | into your path so that you have access to `uglifyjs` from the command
27 | line (remember to restart your terminal window when adding to the path.)
28 |
29 | Once you have `uglifyjs` command available, running `make` from your
30 | fork's root directory will rebuild both `nv.d3.js` and `nv.d3.min.js`.
31 |
32 | Without UglifyJS, you won't get the minified version when running make.
33 |
34 | **We ask that you DO NOT minify pull requests.**
35 | If you need to minify, please build the pull request in a separate branch, and
36 | merge and minify in your master.
37 |
38 |
--------------------------------------------------------------------------------
/ch05/web/static/nvd3/build.bat:
--------------------------------------------------------------------------------
@echo off
rem Build nv.d3.js on Windows by concatenating the sources in binary mode (/B).
rem Core files go first, in the same order as the Makefile and build.sh.
rem src\utils.js is included here because both of those builds include it.
copy src\intro.js /B + src\core.js /B + src\tooltip.js /B + src\utils.js /B temp1.js /B
rem All chart models, in whatever order the wildcard expands.
copy src\models\*.js /B temp2.js /B
rem src\outro.js must come last: it closes the wrapper opened by src\intro.js.
copy temp1.js /B + temp2.js /B + src\outro.js /B nv.d3.js /B
del temp1.js
del temp2.js
7 |
--------------------------------------------------------------------------------
/ch05/web/static/nvd3/build.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Build nv.d3.js by concatenating the sources, then build nv.d3.min.js as well
# when yui-compressor is installed.

# Path of the compressor, or empty when it is not on PATH.
COMPRESSOR=$(command -v yui-compressor)

cat src/intro.js src/core.js src/tooltip.js src/utils.js src/models/*.js src/outro.js > nv.d3.js

# Test for a non-empty, executable path. The previous unquoted
# `[ -e $COMPRESSOR ]` collapsed to `[ -e ]` when nothing was found, which is
# true, so the script then tried to run `--type` as a command.
if [ -n "$COMPRESSOR" ] && [ -x "$COMPRESSOR" ]; then
    "$COMPRESSOR" --type js -o nv.d3.min.js nv.d3.js
fi
7 |
--------------------------------------------------------------------------------
/ch05/web/static/nvd3/deprecated/lineChart-old.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
84 |
--------------------------------------------------------------------------------
/ch05/web/static/nvd3/examples/images/grey-minus.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code/5a5a5f11de5ed0c4949bf1e7c80fba329fc72120/ch05/web/static/nvd3/examples/images/grey-minus.png
--------------------------------------------------------------------------------
/ch05/web/static/nvd3/examples/images/grey-plus.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code/5a5a5f11de5ed0c4949bf1e7c80fba329fc72120/ch05/web/static/nvd3/examples/images/grey-plus.png
--------------------------------------------------------------------------------
/ch05/web/static/nvd3/examples/legend.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
76 |
--------------------------------------------------------------------------------
/ch05/web/static/nvd3/examples/line.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
96 |
--------------------------------------------------------------------------------
/ch05/web/static/nvd3/examples/lineWithFocusChart.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
88 |
--------------------------------------------------------------------------------
/ch05/web/static/nvd3/examples/multiBar.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
93 |
--------------------------------------------------------------------------------
/ch05/web/static/nvd3/examples/multiBarChart.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
81 |
--------------------------------------------------------------------------------
/ch05/web/static/nvd3/examples/pie.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
94 |
--------------------------------------------------------------------------------
/ch05/web/static/nvd3/examples/sparkline.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
21 |
22 |
23 | Sparkline:
24 |
25 |
26 |
27 |
28 |
29 |
63 |
--------------------------------------------------------------------------------
/ch05/web/static/nvd3/examples/sparklinePlus.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
18 |
19 |
20 | SparklinePlus:
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
68 |
--------------------------------------------------------------------------------
/ch05/web/static/nvd3/examples/stream_layers.js:
--------------------------------------------------------------------------------
1 |
/* Inspired by Lee Byron's test data generator.
 *
 * Builds n layers of m points each. Every layer starts at a random baseline
 * between o and 2*o (o defaults to 0 when omitted), receives five random
 * bumps, and is converted to {x, y} points by stream_index.
 */
function stream_layers(n, m, o) {
  if (arguments.length < 3) {
    o = 0;
  }

  // Adds one bump of random height, position and width to `series` in place.
  var addBump = function(series) {
    var height = 1 / (0.1 + Math.random());
    var center = 2 * Math.random() - 0.5;
    var sharpness = 10 / (0.1 + Math.random());
    for (var k = 0; k < m; k++) {
      var offset = (k / m - center) * sharpness;
      series[k] += height * Math.exp(-offset * offset);
    }
  };

  // Builds one layer: random baseline first, then five bumps.
  var buildLayer = function() {
    var series = [];
    var k;
    for (k = 0; k < m; k++) {
      series[k] = o + o * Math.random();
    }
    for (k = 0; k < 5; k++) {
      addBump(series);
    }
    return series.map(stream_index);
  };

  return d3.range(n).map(buildLayer);
}
21 |
/* Another layer generator using gamma distributions.
 *
 * Builds n layers of m points each. Layer i evaluates 2 * x * exp(-x / 2)
 * at x = 20 * j / m - i / 3, and stream_index turns each value into an
 * {x, y} point.
 */
function stream_waves(n, m) {
  var layerIndexes = d3.range(n);
  return layerIndexes.map(function(layer) {
    var values = d3.range(m).map(function(step) {
      var x = 20 * step / m - layer / 3;
      return 2 * x * Math.exp(-0.5 * x);
    });
    return values.map(stream_index);
  });
}
31 |
// Map callback: turns value d at index i into a point {x: i, y: d},
// with negative values clamped to 0.
function stream_index(d, i) {
  var point = {};
  point.x = i;
  point.y = Math.max(0, d);
  return point;
}
35 |
36 |
--------------------------------------------------------------------------------
/ch05/web/static/nvd3/src/intro.js:
--------------------------------------------------------------------------------
1 | (function(){
2 |
--------------------------------------------------------------------------------
/ch05/web/static/nvd3/src/outro.js:
--------------------------------------------------------------------------------
1 | })();
--------------------------------------------------------------------------------
/ch05/web/templates/partials/email.html:
--------------------------------------------------------------------------------
1 |
2 | {% extends "layout.html" %}
3 |
4 |
5 | {% import "macros.jnj" as common %}
6 |
7 |
8 | {% macro display_in_reply_to(key, name) %}
9 | {% if email[key] != 'None' -%}
10 |
11 | {{ common.display_label(name)|safe }}
12 | {{ common.display_link(email[key], '/email', email[key])|safe }}
13 |
14 | {% endif -%}
15 | {% endmacro -%}
16 |
17 |
{# Render an email body as HTML: escape it first (the body is untrusted mail
   content), then turn CRLF line breaks into <br /> tags. Only the <br />
   literal is marked safe, so markup inside the body is never emitted raw. #}
{% macro convert_body(body) -%}
  {{ body|e|replace('\r\n', '<br />'|safe) }}
{% endmacro -%}
21 |
22 |
23 | {% macro display_email_body(record) -%}
24 | {% if(record['body']) -%}
25 |
26 |
27 | {{ convert_body(record['body']) }}
28 |
29 |
30 | {% endif -%}
31 | {% endmacro -%}
32 |
33 |
34 | {% block content -%}
35 |
38 | Email ID: {{email['message_id']}}
39 |
40 | {{ common.display_email_addresses('From', email['from'])|safe }}
41 | {{ common.display_email_addresses('To', email['tos'])|safe }}
42 | {{ common.display_email_addresses('Cc', email['ccs'])|safe }}
43 | {{ common.display_email_addresses('Bcc', email['bccs'])|safe }}
44 | {{ common.display_email_addresses('Reply-To', email['reply_tos'])|safe }}
45 |
46 | {{ display_in_reply_to('in_reply_to', 'In-Reply-To') }}
47 | {{ common.display_field(email['date'], 'Date')|safe }}
48 | {{ common.display_field(email['subject'], 'Subject')|safe }}
49 |
50 | {{ display_email_body(email) }}
51 |
52 | {% endblock -%}
--------------------------------------------------------------------------------
/ch05/web/templates/partials/emails.html:
--------------------------------------------------------------------------------
1 |
2 | {% extends "layout.html" %}
3 |
4 |
5 | {% import "macros.jnj" as common %}
6 |
7 |
8 |
9 | {% block content -%}
10 |
13 | Emails
14 |
17 |
18 |
19 |
20 | From |
21 | Subject |
22 | Date |
23 |
24 |
25 | {% for email in emails %}
26 |
27 | {{ common.display_email_address(email['from'])|safe }} |
28 | {{ common.display_link(email['message_id'], '/email', email['subject'])|safe }} |
29 | {{ email['date'] }} |
30 |
31 | {% endfor %}
32 |
33 |
34 | {% if nav_offsets and nav_path -%}
35 | {{ common.display_nav(nav_offsets, nav_path, query)|safe }}
36 | {% endif -%}
37 |
38 | {% endblock -%}
39 |
--------------------------------------------------------------------------------
/ch06/list_addresses.py:
--------------------------------------------------------------------------------
import pymongo

# Setup Mongo
# pymongo.Connection was removed in PyMongo 3. Use MongoClient when the
# installed driver provides it, and fall back to Connection on old drivers.
client_class = getattr(pymongo, 'MongoClient', None) or pymongo.Connection
conn = client_class()  # defaults to localhost
db = conn.agile_data
addresses_per_email = db['addresses_per_email']

# Print the first 20 address lists.
# print(...) with a single argument behaves the same on Python 2 and 3.
address_lists = addresses_per_email.find()[0:20]
for addresses in address_lists:
    print(addresses)

emails_per_address = db['emails_per_address']
# find_one() returns a single document (or None when the collection is empty),
# not a list of emails. Iterating over it would only print the document's keys,
# so print the entries of its 'messages' array instead.
record = emails_per_address.find_one()
email_list = record.get('messages', []) if record else []
for email in email_list:
    print(email)
16 |
--------------------------------------------------------------------------------
/ch06/mongo.js:
--------------------------------------------------------------------------------
1 | use agile_data
2 | show collections
3 | db.emails_per_address.ensureIndex({address: 1}); // ascending index on the address field
4 | db.emails_per_address.findOne() // fetch one document; the commented lines below show sample output
5 | // {
6 | // "_id" : ObjectId("50f1cfe93004acab8d0340ea"),
7 | // "address" : "user@pig.apache.org",
8 | // "messages" : [
9 | // {
10 | // "message_id" : "2CC96549-8E00-46BF-998E-5606B6952467@gmail.com",
11 | // "subject" : "Re: Group by with count",
12 | // "date" : "2012-12-27T15:36:58"
13 | // },
14 | // {
15 | // "message_id" : "2CC96549-8E00-46BF-998E-5606B6952467@gmail.com",
16 | // "subject" : "Re: Group by with count",
17 | // "date" : "2012-12-27T15:36:58"
18 | // },
19 | // {
20 | // "message_id" : "2CC96549-8E00-46BF-998E-5606B6952467@gmail.com",
21 | // "subject" : "Re: Group by with count",
22 | // "date" : "2012-12-27T15:36:58"
23 | // },
24 | // ...
25 | db.addresses_per_email.ensureIndex({message_id: 1}); // ascending index on the message_id field
26 | db.addresses_per_email.findOne() // fetch one document; the commented lines below show sample output
27 | // {
28 | // "_id" : ObjectId("50f1d8453004db7be37cffb0"),
29 | // "message_id" : "kl59ip.iuzmp1@",
30 | // "addresses" : [
31 | // {
32 | // "address" : "artifacts@computerhistory.org"
33 | // },
34 | // {
35 | // "address" : "russell.jurney@gmail.com"
36 | // },
37 | // {
38 | // "address" : "russell.jurney@gmail.com"
39 | // }
40 | // ]
41 | // }
42 | db.sent_distributions.ensureIndex({address: 1}) // ascending index on the address field
43 | db.sent_distributions.findOne() // fetch one document; the commented lines below show sample output
44 | // {
45 | // "_id" : ObjectId("50f365ba30042ade8f22cb86"),
46 | // "address" : "russell.jurney@gmail.com",
47 | // "sent_distribution" : [
48 | // {
49 | // "sent_hour" : "00",
50 | // "total" : NumberLong(435)
51 | // },
52 | // {
53 | // "sent_hour" : "01",
54 | // "total" : NumberLong(307)
55 | // },
56 | // ...
57 | // ]
58 | // }
59 |
--------------------------------------------------------------------------------
/ch06/web/config.py:
--------------------------------------------------------------------------------
1 | EMAILS_PER_PAGE=15  # page size for email listings -- presumably read by the web app's pagination; confirm against the app code
2 | ELASTIC_URL='http://localhost:9200/inbox'  # URL of the "inbox" index on localhost:9200 (presumably ElasticSearch, per the name)
--------------------------------------------------------------------------------
/ch06/web/static/bootstrap/img/glyphicons-halflings-white.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code/5a5a5f11de5ed0c4949bf1e7c80fba329fc72120/ch06/web/static/bootstrap/img/glyphicons-halflings-white.png
--------------------------------------------------------------------------------
/ch06/web/static/bootstrap/img/glyphicons-halflings.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code/5a5a5f11de5ed0c4949bf1e7c80fba329fc72120/ch06/web/static/bootstrap/img/glyphicons-halflings.png
--------------------------------------------------------------------------------
/ch06/web/static/nvd3/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | # Jekyll Files #
3 | ################
4 | _site
5 |
6 |
7 | # Random Files #
8 | ################
9 | *.swp
10 | *~
11 | *.log
12 |
13 |
14 | # Private Test Data #
15 | #####################
16 | *REALDATA*
17 |
18 |
19 | # OS generated files #
20 | ######################
21 | .DS_Store*
22 | ehthumbs.db
23 | Icon?
24 | Thumbs.db
25 |
--------------------------------------------------------------------------------
/ch06/web/static/nvd3/LICENSE.md:
--------------------------------------------------------------------------------
1 |
2 | ##nvd3.js License
3 |
4 | Copyright (c) 2011, 2012 [Novus Partners, Inc.][novus]
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 |
18 | [novus]: https://www.novus.com/
19 |
20 |
21 |
22 | ##d3.js License
23 |
24 | Copyright (c) 2012, Michael Bostock
25 | All rights reserved.
26 |
27 | Redistribution and use in source and binary forms, with or without
28 | modification, are permitted provided that the following conditions are met:
29 |
30 | * Redistributions of source code must retain the above copyright notice, this
31 | list of conditions and the following disclaimer.
32 |
33 | * Redistributions in binary form must reproduce the above copyright notice,
34 | this list of conditions and the following disclaimer in the documentation
35 | and/or other materials provided with the distribution.
36 |
37 | * The name Michael Bostock may not be used to endorse or promote products
38 | derived from this software without specific prior written permission.
39 |
40 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
41 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
43 | DISCLAIMED. IN NO EVENT SHALL MICHAEL BOSTOCK BE LIABLE FOR ANY DIRECT,
44 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
45 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
46 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
47 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
48 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
49 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
50 |
--------------------------------------------------------------------------------
/ch06/web/static/nvd3/Makefile:
--------------------------------------------------------------------------------
# Sources concatenated, in this order, to build nv.d3.js.
# src/intro.js opens and src/outro.js closes the function wrapper around
# everything in between, so they must stay first and last.
JS_FILES = \
	src/intro.js \
	src/core.js \
	src/tooltip.js \
	src/utils.js \
	src/models/axis.js \
	src/models/historicalBar.js \
	src/models/bullet.js \
	src/models/bulletChart.js \
	src/models/cumulativeLineChart.js \
	src/models/discreteBar.js \
	src/models/discreteBarChart.js \
	src/models/distribution.js \
	src/models/indentedTree.js \
	src/models/legend.js \
	src/models/line.js \
	src/models/lineChart.js \
	src/models/linePlusBarChart.js \
	src/models/lineWithFocusChart.js \
	src/models/multiBar.js \
	src/models/multiBarChart.js \
	src/models/multiBarHorizontal.js \
	src/models/multiBarHorizontalChart.js \
	src/models/multiChart.js \
	src/models/ohlcBar.js \
	src/models/pie.js \
	src/models/pieChart.js \
	src/models/scatter.js \
	src/models/scatterChart.js \
	src/models/scatterPlusLineChart.js \
	src/models/sparkline.js \
	src/models/sparklinePlus.js \
	src/models/stackedArea.js \
	src/models/stackedAreaChart.js \
	src/outro.js

# Minifier used to produce nv.d3.min.js; must be available on PATH.
JS_COMPILER = \
	uglifyjs

# `all` and `clean` name actions, not files: declare them phony so a stray
# file called "all" or "clean" cannot make them look up to date.
.PHONY: all clean

all: nv.d3.js nv.d3.min.js
nv.d3.js: $(JS_FILES)
nv.d3.min.js: $(JS_FILES)

# Rebuild from scratch whenever a source file or this Makefile changes.
# $(filter %.js,$^) drops the Makefile itself from the list of inputs.
nv.d3.js: Makefile
	rm -f $@
	cat $(filter %.js,$^) >> $@

%.min.js:: Makefile
	rm -f $@
	cat $(filter %.js,$^) | $(JS_COMPILER) >> $@

clean:
	rm -rf nv.d3.js nv.d3.min.js
54 |
55 |
56 |
--------------------------------------------------------------------------------
/ch06/web/static/nvd3/README.md:
--------------------------------------------------------------------------------
1 | Please see Novus' official statement on nvd3 with an explanation,
2 | apology, and commitment to its permanent status as an open-source
3 | project.
4 | [http://nvd3.org/statement.html](http://nvd3.org/statement.html)
5 |
6 | # nvd3 - v0.0.1
7 |
8 | A reusable chart library for d3.JS.
9 |
10 | Currently in an early stage of development, but will be a very active project. It may change quite a bit from its current state, but will always try to follow the style in which d3.js was done.
11 |
12 | You can also check out the [examples page](http://nvd3.org/ghpages/examples.html)
13 |
14 | ---
15 |
16 | If none of [the existing models](https://github.com/novus/nvd3/tree/master/src/models) meets your needs, fork the project, implement the model along with an example that uses it, and send us a pull request to be considered for inclusion in the project.
17 |
18 | ---
19 |
20 | Minifying your fork:
21 |
22 | The Makefile requires [UglifyJS](https://github.com/mishoo/UglifyJS).
23 |
24 | The easiest way to install it is via npm. Run `npm install
25 | uglify-js` from your home directory, then add the output from `npm bin`
26 | into your path so that you have access to `uglifyjs` from the command
27 | line (remember to restart your terminal window when adding to the path.)
28 |
29 | Once you have `uglifyjs` command available, running `make` from your
30 | fork's root directory will rebuild both `nv.d3.js` and `nv.d3.min.js`.
31 |
32 | Without UglifyJS, you won't get the minified version when running make.
33 |
34 | **We ask that you DO NOT minify pull requests.**
35 | If you need to minify, please build the pull request in a separate branch, and
36 | merge and minify in your master.
37 |
38 |
--------------------------------------------------------------------------------
/ch06/web/static/nvd3/build.bat:
--------------------------------------------------------------------------------
@echo off
rem Build nv.d3.js on Windows by concatenating the sources in binary mode (/B).
rem Core files go first, in the same order as the Makefile and build.sh.
rem src\utils.js is included here because both of those builds include it.
copy src\intro.js /B + src\core.js /B + src\tooltip.js /B + src\utils.js /B temp1.js /B
rem All chart models, in whatever order the wildcard expands.
copy src\models\*.js /B temp2.js /B
rem src\outro.js must come last: it closes the wrapper opened by src\intro.js.
copy temp1.js /B + temp2.js /B + src\outro.js /B nv.d3.js /B
del temp1.js
del temp2.js
7 |
--------------------------------------------------------------------------------
/ch06/web/static/nvd3/build.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Build nv.d3.js by concatenating the sources, then build nv.d3.min.js as well
# when yui-compressor is installed.

# Path of the compressor, or empty when it is not on PATH.
COMPRESSOR=$(command -v yui-compressor)

cat src/intro.js src/core.js src/tooltip.js src/utils.js src/models/*.js src/outro.js > nv.d3.js

# Test for a non-empty, executable path. The previous unquoted
# `[ -e $COMPRESSOR ]` collapsed to `[ -e ]` when nothing was found, which is
# true, so the script then tried to run `--type` as a command.
if [ -n "$COMPRESSOR" ] && [ -x "$COMPRESSOR" ]; then
    "$COMPRESSOR" --type js -o nv.d3.min.js nv.d3.js
fi
7 |
--------------------------------------------------------------------------------
/ch06/web/static/nvd3/deprecated/lineChart-old.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
84 |
--------------------------------------------------------------------------------
/ch06/web/static/nvd3/examples/images/grey-minus.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code/5a5a5f11de5ed0c4949bf1e7c80fba329fc72120/ch06/web/static/nvd3/examples/images/grey-minus.png
--------------------------------------------------------------------------------
/ch06/web/static/nvd3/examples/images/grey-plus.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code/5a5a5f11de5ed0c4949bf1e7c80fba329fc72120/ch06/web/static/nvd3/examples/images/grey-plus.png
--------------------------------------------------------------------------------
/ch06/web/static/nvd3/examples/legend.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
76 |
--------------------------------------------------------------------------------
/ch06/web/static/nvd3/examples/line.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
96 |
--------------------------------------------------------------------------------
/ch06/web/static/nvd3/examples/lineWithFocusChart.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
88 |
--------------------------------------------------------------------------------
/ch06/web/static/nvd3/examples/multiBar.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
93 |
--------------------------------------------------------------------------------
/ch06/web/static/nvd3/examples/multiBarChart.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
81 |
--------------------------------------------------------------------------------
/ch06/web/static/nvd3/examples/pie.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
94 |
--------------------------------------------------------------------------------
/ch06/web/static/nvd3/examples/sparkline.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
21 |
22 |
23 | Sparkline:
24 |
25 |
26 |
27 |
28 |
29 |
63 |
--------------------------------------------------------------------------------
/ch06/web/static/nvd3/examples/sparklinePlus.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
18 |
19 |
20 | SparklinePlus:
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
68 |
--------------------------------------------------------------------------------
/ch06/web/static/nvd3/examples/stream_layers.js:
--------------------------------------------------------------------------------
1 |
/* Inspired by Lee Byron's test data generator.
 *
 * Builds n layers of m points each. Every layer starts at a random baseline
 * between o and 2*o (o defaults to 0 when omitted), receives five random
 * bumps, and is converted to {x, y} points by stream_index.
 */
function stream_layers(n, m, o) {
  if (arguments.length < 3) {
    o = 0;
  }

  // Adds one bump of random height, position and width to `series` in place.
  var addBump = function(series) {
    var height = 1 / (0.1 + Math.random());
    var center = 2 * Math.random() - 0.5;
    var sharpness = 10 / (0.1 + Math.random());
    for (var k = 0; k < m; k++) {
      var offset = (k / m - center) * sharpness;
      series[k] += height * Math.exp(-offset * offset);
    }
  };

  // Builds one layer: random baseline first, then five bumps.
  var buildLayer = function() {
    var series = [];
    var k;
    for (k = 0; k < m; k++) {
      series[k] = o + o * Math.random();
    }
    for (k = 0; k < 5; k++) {
      addBump(series);
    }
    return series.map(stream_index);
  };

  return d3.range(n).map(buildLayer);
}
21 |
/* Another layer generator using gamma distributions.
 *
 * Builds n layers of m points each. Layer i evaluates 2 * x * exp(-x / 2)
 * at x = 20 * j / m - i / 3, and stream_index turns each value into an
 * {x, y} point.
 */
function stream_waves(n, m) {
  var layerIndexes = d3.range(n);
  return layerIndexes.map(function(layer) {
    var values = d3.range(m).map(function(step) {
      var x = 20 * step / m - layer / 3;
      return 2 * x * Math.exp(-0.5 * x);
    });
    return values.map(stream_index);
  });
}
31 |
// Map callback: turns value d at index i into a point {x: i, y: d},
// with negative values clamped to 0.
function stream_index(d, i) {
  var point = {};
  point.x = i;
  point.y = Math.max(0, d);
  return point;
}
35 |
36 |
--------------------------------------------------------------------------------
/ch06/web/static/nvd3/src/intro.js:
--------------------------------------------------------------------------------
1 | (function(){
2 |
--------------------------------------------------------------------------------
/ch06/web/static/nvd3/src/outro.js:
--------------------------------------------------------------------------------
1 | })();
--------------------------------------------------------------------------------
/ch06/web/templates/partials/address.html:
--------------------------------------------------------------------------------
1 |
2 | {% extends "layout.html" %}
3 |
4 |
5 | {% import "macros.jnj" as common %}
6 |
7 |
8 |
9 | {% block content -%}
10 | Email Address
11 |
12 |
13 |
14 | Subject |
15 | Date |
16 |
17 |
18 | {% for email in emails %}
19 |
20 | {{ common.display_link(email['message_id'], '/email', email['subject'])|safe }} |
21 | {{ email['date'] }} |
22 |
23 | {% endfor %}
24 |
25 |
26 | {% if nav_offsets and nav_path -%}
27 | {{ common.display_nav(nav_offsets, nav_path, query)|safe }}
28 | {% endif -%}
29 |
30 | {% endblock -%}
31 |
--------------------------------------------------------------------------------
/ch06/web/templates/partials/emails.html:
--------------------------------------------------------------------------------
1 |
2 | {% extends "layout.html" %}
3 |
4 |
5 | {% import "macros.jnj" as common %}
6 |
7 |
8 |
9 | {% block content -%}
10 | Emails
11 |
14 |
15 |
16 |
17 | From |
18 | Subject |
19 | Date |
20 |
21 |
22 | {% for email in emails %}
23 |
24 | {{ common.display_email_address(email['from'])|safe }} |
25 | {{ common.display_link(email['message_id'], '/email', email['subject'])|safe }} |
26 | {{ email['date'] }} |
27 |
28 | {% endfor %}
29 |
30 |
31 | {% if nav_offsets and nav_path -%}
32 | {{ common.display_nav(nav_offsets, nav_path, query)|safe }}
33 | {% endif -%}
34 |
35 | {% endblock -%}
36 |
--------------------------------------------------------------------------------
/ch07/mongo/sent_distribution_fix_mongo.js:
--------------------------------------------------------------------------------
/**
 * Build an array of evenly spaced numbers, in the spirit of Python's range().
 *
 *   range(stop)              -> [0, 1, ..., stop - 1]
 *   range(start, stop)       -> [start, ..., stop - 1]
 *   range(start, stop, step) -> start, start + step, ... (stop is exclusive)
 *
 * Returns an empty array when the bounds are already crossed for the given
 * step direction, or when step is 0 (which could otherwise never terminate).
 */
function range(start, stop, step) {
  if (typeof stop == 'undefined') {
    // Only one argument supplied: it is the exclusive upper bound.
    stop = start;
    start = 0;
  }
  if (typeof step == 'undefined') {
    step = 1;
  }
  if (step === 0) {
    // A zero step never advances; refuse to loop forever.
    return [];
  }
  if ((step > 0 && start >= stop) || (step < 0 && start <= stop)) {
    return [];
  }
  var result = [];
  // Ascending ranges stop below `stop`, descending ranges stop above it.
  for (var i = start; step > 0 ? i < stop : i > stop; i += step) {
    result.push(i);
  }
  return result;
}
19 |
// Format an hour as a zero-padded, two-character string: "00" - "23".
function makeHourRange(num) {
  return num < 10 ? "0" + num.toString() : num.toString();
}

/**
 * Return a 24-element array with one entry per hour of the day.
 *
 * rawData is an array of records carrying a 'sent_hour' field. For each hour
 * "00".."23" the first record whose 'sent_hour' matches is reused as-is;
 * hours with no matching record get a {'sent_hour': hour, 'total': 0}
 * placeholder. A missing rawData is treated as an empty array.
 */
function fillBlanks(rawData) {
  var rows = rawData || [];
  var filled = [];
  // Plain counted loops: no implicit globals and no for...in over arrays
  // (which yields string indices and any inherited enumerable properties).
  for (var hour = 0; hour < 24; hour++) {
    var hourString = makeHourRange(hour);
    var match = null;
    for (var idx = 0; idx < rows.length; idx++) {
      // Loose equality is kept on purpose so the comparison behaves exactly
      // as before for whatever type 'sent_hour' was stored with.
      if (rows[idx]['sent_hour'] == hourString) {
        match = rows[idx];
        break;
      }
    }
    if (match !== null) {
      filled.push(match);
    } else {
      filled.push({'sent_hour': hourString, 'total': 0});
    }
  }
  return filled;
}
51 |
52 | use agile_data
53 | data = sent_dist.findOne();
54 | fillBlanks(data['sent_distribution']);
55 |
--------------------------------------------------------------------------------
/ch07/pig/lda.pig:
--------------------------------------------------------------------------------
1 | /* Set Home Directory - where we install software */
2 | %default HOME `echo \$HOME/Software/`
3 |
4 | /* Avro uses json-simple, and is in piggybank until Pig 0.12, where AvroStorage and TrevniStorage are builtins */
5 | REGISTER $HOME/pig/build/ivy/lib/Pig/avro-1.5.3.jar
6 | REGISTER $HOME/pig/build/ivy/lib/Pig/json-simple-1.1.jar
7 | REGISTER $HOME/pig/contrib/piggybank/java/piggybank.jar
8 |
9 | DEFINE AvroStorage org.apache.pig.piggybank.storage.avro.AvroStorage();
10 |
11 | REGISTER $HOME/varaha/lib/*.jar /* */
12 | REGISTER $HOME/varaha/target/varaha-1.0-SNAPSHOT.jar
13 |
14 | define TokenizeText varaha.text.TokenizeText();
15 | define LDATopics varaha.topic.LDATopics();
16 | define RangeConcat org.pygmalion.udf.RangeBasedStringConcat('0', ' ');
17 |
18 | set default_parallel 10
19 | set mapred.map.tasks.speculative.execution false
20 | set mapred.reduce.tasks.speculative.execution false
21 |
22 | --
23 | -- Load the docs
24 | --
25 | emails = load '/me/Data/test_mbox' using AvroStorage();
26 | raw_documents = foreach emails generate message_id, body;
27 | --
28 | -- Tokenize text to remove stopwords
29 | --
30 | tokenized = foreach raw_documents generate message_id, flatten(TokenizeText(body)) as (token:chararray);
31 |
32 | --
33 | -- Concat the text for a given doc with spaces
34 | --
35 | documents = foreach (group tokenized by message_id) generate group as message_id, RangeConcat(tokenized.token) as text;
36 |
37 | --
38 | -- Ensure all our documents are sane
39 | --
40 | for_lda = filter documents by message_id IS NOT NULL and text IS NOT NULL;
41 |
42 | --
43 | -- Group the docs by all and find topics
44 | --
45 | -- WARNING: This is, in general, not appropriate in a production environment.
46 | -- Instead it is best to group by some piece of metadata which partitions
47 | -- the documents into smaller groups.
48 | --
49 | topics = foreach (group for_lda all) generate
50 | FLATTEN(LDATopics(20, for_lda)) as (
51 | topic_num:int,
52 | keywords:bag {t:tuple(keyword:chararray, weight:int)}
53 | );
54 |
55 |
56 | store topics into '/tmp/lda_topics.txt';
--------------------------------------------------------------------------------
/ch07/pig/ntfidf.macro:
--------------------------------------------------------------------------------
1 | /* Derived from TF-IDF by Jacob Perkins at http://thedatachef.blogspot.com/2011/04/tf-idf-with-apache-pig.html with
2 | help from Mat Kelcey who referred me to http://nlp.stanford.edu/IR-book/html/htmledition/maximum-tf-normalization-1.html */
3 |
4 | DEFINE ntf_idf(token_records, id_field, token_field) RETURNS out_relation {
5 |
6 | /* Calculate the term count per document */
7 | doc_word_totals = foreach (group $token_records by ($id_field, $token_field)) generate
8 | FLATTEN(group) as ($id_field, token),
9 | COUNT_STAR($token_records) as doc_total;
10 |
11 | /* Calculate the document size */
12 | pre_term_counts = foreach (group doc_word_totals by $id_field) generate
13 | group AS $id_field,
14 | FLATTEN(doc_word_totals.(token, doc_total)) as (token, doc_total),
15 | SUM(doc_word_totals.doc_total) as doc_size,
16 | MAX(doc_word_totals.doc_total) as max_freq;
17 |
18 | /* Calculate the TF */
19 | term_freqs = foreach pre_term_counts generate
20 | $id_field as $id_field,
21 | token as token,
22 | ((double)doc_total / (double)doc_size / (double) max_freq) AS term_freq;
23 |
24 | /* Get count of documents using each token, for idf */
25 | token_usages = foreach (group term_freqs by token) generate
26 | FLATTEN(term_freqs) as ($id_field:chararray, token:chararray, term_freq:double),
27 | COUNT_STAR(term_freqs) as num_docs_with_token;
28 |
29 | /* Get document count */
30 | just_ids = foreach $token_records generate $id_field;
31 | just_ids = DISTINCT just_ids;
32 | ndocs = foreach (group just_ids all) generate COUNT_STAR(just_ids) as total_docs;
33 |
34 | /* Note the use of Pig Scalars to calculate idf */
35 | scores = foreach token_usages {
36 | idf = LOG((double)ndocs.total_docs/(double)num_docs_with_token);
37 | ntf_idf = (double)term_freq * idf;
38 | generate $id_field as $id_field,
39 | token as token,
40 | (double)ntf_idf as score:double;
41 | };
42 |
43 | $out_relation = filter scores by token IS NOT NULL and token != '' and LENGTH(token) > 2; -- score > 0.10 and
44 | };
45 |
--------------------------------------------------------------------------------
/ch07/pig/process_mcl.pig:
--------------------------------------------------------------------------------
1 | sent_counts = load '/tmp/sent_counts.tsv' as (from:chararray, to:chararray, weight:int);
2 |
--------------------------------------------------------------------------------
/ch07/pig/publish_topics_per_email.pig:
--------------------------------------------------------------------------------
1 | /* Set Home Directory - where we install software */
2 | %default HOME `echo \$HOME/Software/`
3 |
4 | /* MongoDB libraries and configuration */
5 | REGISTER $HOME/mongo-hadoop/mongo-2.10.1.jar
6 | REGISTER $HOME/mongo-hadoop/core/target/mongo-hadoop-core-1.1.0-SNAPSHOT.jar
7 | REGISTER $HOME/mongo-hadoop/pig/target/mongo-hadoop-pig-1.1.0-SNAPSHOT.jar
8 |
9 | DEFINE MongoStorage com.mongodb.hadoop.pig.MongoStorage();
10 |
11 | per_document_scores = LOAD '/tmp/topics_per_document.txt' AS (message_id:chararray, topics:bag{topic:tuple(word:chararray, score:double)});
12 | store per_document_scores into 'mongodb://localhost/agile_data.topics_per_email' using MongoStorage();
13 |
--------------------------------------------------------------------------------
/ch07/pig/test_tokenizers.pig:
--------------------------------------------------------------------------------
1 | /* Set Home Directory - where we install software */
2 | %default HOME `echo \$HOME/Software/`
3 |
4 | /* Avro uses json-simple, and is in piggybank until Pig 0.12, where AvroStorage and TrevniStorage are builtins */
5 | REGISTER $HOME/pig/build/ivy/lib/Pig/avro-1.5.3.jar
6 | REGISTER $HOME/pig/build/ivy/lib/Pig/json-simple-1.1.jar
7 | REGISTER $HOME/pig/contrib/piggybank/java/piggybank.jar
8 |
9 | DEFINE AvroStorage org.apache.pig.piggybank.storage.avro.AvroStorage();
10 |
11 | REGISTER $HOME/varaha/lib/*.jar /* */
12 | REGISTER $HOME/varaha/target/varaha-1.0-SNAPSHOT.jar
13 |
14 | DEFINE TokenizeText varaha.text.TokenizeText();
15 | DEFINE StanfordTokenize varaha.text.StanfordTokenize();
16 |
17 | rmf /tmp/test_lucene.txt
18 | rmf /tmp/test_stanford.txt
19 |
20 | set default_parallel 5
21 | set mapred.map.tasks.speculative.execution false
22 | set mapred.reduce.tasks.speculative.execution false
23 |
24 | emails = load '/me/Data/test_mbox' using AvroStorage();
25 | emails = limit emails 10;
26 | id_body = foreach emails generate message_id, body;
27 |
28 | token_records = foreach id_body generate message_id, FLATTEN(TokenizeText(body)) as tokens;
29 | token_records_2 = foreach id_body generate message_id, FLATTEN(StanfordTokenize(body)) as tokens;
30 | store token_records into '/tmp/test_lucene.txt';
31 | store token_records_2 into '/tmp/test_stanford.txt';
--------------------------------------------------------------------------------
/ch07/pig/topics.pig:
--------------------------------------------------------------------------------
/* Set Home Directory - where we install software */
%default HOME `echo \$HOME/Software/`

/* Avro uses json-simple, and is in piggybank until Pig 0.12, where AvroStorage and TrevniStorage are builtins */
REGISTER $HOME/pig/build/ivy/lib/Pig/avro-1.7.4.jar
REGISTER $HOME/pig/build/ivy/lib/Pig/json-simple-1.1.jar
REGISTER $HOME/pig/contrib/piggybank/java/piggybank.jar

DEFINE AvroStorage org.apache.pig.piggybank.storage.avro.AvroStorage();
DEFINE LENGTH org.apache.pig.piggybank.evaluation.string.LENGTH();

REGISTER $HOME/varaha/lib/*.jar /* Varaha has a good tokenizer */
REGISTER $HOME/varaha/target/varaha-1.0-SNAPSHOT.jar

DEFINE TokenizeText varaha.text.TokenizeText('1', '1');

set default_parallel 20

/* Clear previous outputs so the script can be re-run. */
rmf /tmp/tf_idf_scores.txt
rmf /tmp/ntf_idf_scores.txt
rmf /tmp/trimmed_tokens.txt
/* These are the paths the two ntf_idf stores below actually write to;
   without removing them a second run fails because the output already exists. */
rmf /tmp/ntf_idf_scores_per_message.txt
rmf /tmp/ntf_idf_scores_per_address.txt

register 'udfs.py' using jython as funcs;
import 'ntfidf.macro';

/* Load emails and trim unneeded fields */
emails = load '/me/Data/test_mbox' using AvroStorage();
-- emails = FILTER emails BY body IS NOT NULL;
id_body_address = foreach emails generate message_id, body, from.address as address;

/* Project and flatten to message_id/address/token and basic filter */
token_records_address = foreach id_body_address generate message_id, address, FLATTEN(TokenizeText(body)) as token;
trimmed_tokens = filter token_records_address by token is not null and token != '' and LENGTH(token) > 2;
store trimmed_tokens into '/tmp/trimmed_tokens.txt';

/* Run topics per message */
ntf_idf_scores_per_message = ntf_idf(trimmed_tokens, 'message_id', 'token');
store ntf_idf_scores_per_message into '/tmp/ntf_idf_scores_per_message.txt';

/* Run topics per email address */
ntf_idf_scores_per_address = ntf_idf(trimmed_tokens, 'address', 'token');
store ntf_idf_scores_per_address into '/tmp/ntf_idf_scores_per_address.txt';
43 |
--------------------------------------------------------------------------------
/ch07/pig/udfs.py:
--------------------------------------------------------------------------------
@outputSchema("sent_dist:bag{t:(sent_hour:chararray, total:int)}")
def fill_in_blanks(sent_dist):
    """Return one (sent_hour, total) tuple for every hour '00'..'23'.

    sent_dist is a bag of tuples whose first field is the two-digit hour
    string and whose second field is the total for that hour. Hours that are
    absent from the bag are emitted with a total of 0. When an hour appears
    more than once, the first occurrence wins. A null bag yields 24 zero rows.
    """
    # Index the bag once instead of rescanning it for each of the 24 hours.
    totals_by_hour = {}
    for row in (sent_dist or []):
        if row[0] not in totals_by_hour:
            totals_by_hour[row[0]] = row[1]

    out_data = list()
    for hour in ['%02d' % i for i in range(24)]:
        out_data.append((hour, totals_by_hour.get(hour, 0)))
    return out_data
15 |
@outputSchema("token:chararray")
def lower(token):
    """Return token converted to lower case; a null token is passed through as None."""
    if token is None:
        # Null fields arrive as None; propagate the null instead of raising.
        return None
    return token.lower()
19 |
20 | import re, sys
21 |
@outputSchema("token:chararray")
def remove_punctuation(token):
    """Return token unchanged.

    Punctuation stripping is currently disabled: the earlier attempts are kept
    below as commented-out code and none of them is executed.
    """
    # Disabled attempts, kept for reference:
    #   word = re.sub(r'([^\w\s]|_)+(?=\s|$)', '', token)
    #   punctuation = re.compile(r'[-.@&$#`\'?!,>\\":;()|]')
    #   words = list()
    #   word = punctuation.sub('', token, count=sys.maxint)
    return token
29 |
30 | import operator
31 |
32 | def _dotProduct(vector1, vector2):
33 | dotProduct = 0
34 | for i in range(0, len(vector1)):
35 | p = 0
36 | if vector1[i][0] != None:
37 | p = vector1[i][0]
38 | q = 0
39 | if vector2[i][0] != None:
40 | q = vector2[i][0]
41 | dotProduct += p * q
42 | return dotProduct
43 |
@outputSchema("t:tuple(topic1:chararray, topic2:chararray, cosine_similarity:double)")
def cosineSimilarity(topic1, vector1, topic2, vector2):
    """Return (topic1, topic2, cosine similarity of vector1 and vector2).

    The similarity is dot(v1, v2) / (|v1| * |v2|), where each norm is the
    square root of the vector's dot product with itself. If either vector has
    zero magnitude the similarity is reported as 0.0 instead of dividing by
    zero.
    """
    import math  # local import: not among the imports visible around this block

    numerator = _dotProduct(vector1, vector2)
    # The norms need square roots; multiplying the raw self-products would
    # divide by the squared magnitudes instead.
    denominator = math.sqrt(_dotProduct(vector1, vector1)) * math.sqrt(_dotProduct(vector2, vector2))
    if denominator == 0:
        result = 0.0
    else:
        # float() guards against integer division when the inputs are ints.
        result = float(numerator) / denominator
    return (topic1, topic2, result)
--------------------------------------------------------------------------------
/ch07/python/sent_distribution_fix.py:
--------------------------------------------------------------------------------
1 | import pymongo
2 |
def fill_in_blanks(in_data):
    """Return a 24-element list with one record per hour '00'..'23'.

    in_data is an iterable of dicts that each carry a 'sent_hour' key. For
    every hour the first matching record is reused as-is; hours with no record
    get a {'sent_hour': hour, 'total': 0} placeholder. A record without a
    'sent_hour' key raises KeyError.
    """
    # Index the input in a single pass (first record per hour wins). This
    # avoids rescanning the data for each hour and also works when in_data
    # can only be iterated once.
    first_by_hour = {}
    for record in in_data:
        first_by_hour.setdefault(record['sent_hour'], record)

    out_data = list()
    for hour in ['%02d' % i for i in range(24)]:
        if hour in first_by_hour:
            out_data.append(first_by_hour[hour])
        else:
            out_data.append({'sent_hour': hour, 'total': 0})
    return out_data
13 |
def address(email_address):
    """Return the hour-filled sent distribution serialized as a JSON string.

    NOTE(review): email_address is not used, and `sent_dist` is read from
    module scope, where it is bound to the 'sent_distributions' collection
    rather than to a fetched record - confirm which document was intended.
    """
    import json  # local import: the file-level imports only bring in pymongo

    chart_json = json.dumps(fill_in_blanks(sent_dist['sent_dist']))
    # Hand the result back; previously it was computed and then discarded.
    return chart_json
16 |
17 | # Setup Mongo
18 | conn = pymongo.Connection() # defaults to localhost
19 | db = conn.agile_data
20 | sent_dist = db['sent_distributions']
21 |
22 | record = sent_dist.find_one()
23 |
--------------------------------------------------------------------------------
/ch07/python/token_extractor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | from collections import defaultdict
4 | import sys, re
5 | import nltk
6 | import json
7 | import operator
8 |
def log(message):
    """Write message to stderr, prefixed with this script's path when known.

    The prefix is dropped only when __file__ is not defined; any other error
    (for example a non-string message) is allowed to propagate instead of
    being silently swallowed.
    """
    try:
        prefix = __file__ + ": "
    except NameError:
        # __file__ is not defined in every execution context.
        prefix = ""
    sys.stderr.write(prefix + message + "\n")
14 |
class TokenExtractor:
    """Small token-cleaning helpers: lower-casing, punctuation removal, length filter."""

    # Punctuation characters stripped from tokens; compiled once for all calls.
    _PUNCTUATION = re.compile(r'[-.@&$#`\'?!,>\\":;()|]')

    def lower(self, token):
        """Return token converted to lower case."""
        return token.lower()

    def remove_punctuation(self, token):
        """Return token with punctuation removed, or None if nothing is left."""
        word = self._PUNCTUATION.sub("", token)
        if word != "":
            return word
        return None

    def short_filter(self, token):
        """Return token when it is longer than two characters, otherwise None.

        None is accepted (and returned) so the result of remove_punctuation
        can be passed straight through.
        """
        if token is not None and len(token) > 2:
            return token
        return None

def main():
    """Read 'message_id<TAB>token' lines from stdin and print the cleaned tokens.

    Each token is lower-cased, stripped of punctuation and dropped when the
    result is two characters or shorter. Surviving tokens are written to
    stdout as 'message_id<TAB>token', one per line.
    """
    te = TokenExtractor()
    for line in sys.stdin:
        # Drop the line terminator so it does not end up inside the token.
        fields = line.rstrip("\r\n").split('\t', 1)
        if len(fields) != 2:
            # Not a message_id/token pair; nothing sensible to emit.
            continue
        message_id, token = fields
        lowers = te.lower(token)
        no_punc = te.remove_punctuation(lowers)
        no_shorts = te.short_filter(no_punc)
        if no_shorts is None:
            # Token was filtered out entirely.
            continue
        sys.stdout.write(message_id + "\t" + no_shorts + "\n")
39 |
40 | if __name__ == "__main__":
41 | main()
42 |
--------------------------------------------------------------------------------
/ch07/web/config.py:
--------------------------------------------------------------------------------
1 | EMAILS_PER_LIST_PAGE=15
2 | EMAILS_PER_ADDRESS_PAGE=6
3 | ELASTIC_URL='http://localhost:9200/inbox'
4 |
--------------------------------------------------------------------------------
/ch07/web/static/bootstrap/img/glyphicons-halflings-white.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code/5a5a5f11de5ed0c4949bf1e7c80fba329fc72120/ch07/web/static/bootstrap/img/glyphicons-halflings-white.png
--------------------------------------------------------------------------------
/ch07/web/static/bootstrap/img/glyphicons-halflings.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code/5a5a5f11de5ed0c4949bf1e7c80fba329fc72120/ch07/web/static/bootstrap/img/glyphicons-halflings.png
--------------------------------------------------------------------------------
/ch07/web/static/nvd3/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | # Jekyll Files #
3 | ################
4 | _site
5 |
6 |
7 | # Random Files #
8 | ################
9 | *.swp
10 | *~
11 | *.log
12 |
13 |
14 | # Private Test Data #
15 | #####################
16 | *REALDATA*
17 |
18 |
19 | # OS generated files #
20 | ######################
21 | .DS_Store*
22 | ehthumbs.db
23 | Icon?
24 | Thumbs.db
25 |
--------------------------------------------------------------------------------
/ch07/web/static/nvd3/LICENSE.md:
--------------------------------------------------------------------------------
1 |
2 | ##nvd3.js License
3 |
4 | Copyright (c) 2011, 2012 [Novus Partners, Inc.][novus]
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 |
18 | [novus]: https://www.novus.com/
19 |
20 |
21 |
22 | ##d3.js License
23 |
24 | Copyright (c) 2012, Michael Bostock
25 | All rights reserved.
26 |
27 | Redistribution and use in source and binary forms, with or without
28 | modification, are permitted provided that the following conditions are met:
29 |
30 | * Redistributions of source code must retain the above copyright notice, this
31 | list of conditions and the following disclaimer.
32 |
33 | * Redistributions in binary form must reproduce the above copyright notice,
34 | this list of conditions and the following disclaimer in the documentation
35 | and/or other materials provided with the distribution.
36 |
37 | * The name Michael Bostock may not be used to endorse or promote products
38 | derived from this software without specific prior written permission.
39 |
40 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
41 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
43 | DISCLAIMED. IN NO EVENT SHALL MICHAEL BOSTOCK BE LIABLE FOR ANY DIRECT,
44 | INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
45 | BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
46 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
47 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
48 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
49 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
50 |
--------------------------------------------------------------------------------
/ch07/web/static/nvd3/Makefile:
--------------------------------------------------------------------------------
1 | JS_FILES = \
2 | src/intro.js \
3 | src/core.js \
4 | src/tooltip.js \
5 | src/utils.js \
6 | src/models/axis.js \
7 | src/models/historicalBar.js \
8 | src/models/bullet.js \
9 | src/models/bulletChart.js \
10 | src/models/cumulativeLineChart.js \
11 | src/models/discreteBar.js \
12 | src/models/discreteBarChart.js \
13 | src/models/distribution.js \
14 | src/models/indentedTree.js \
15 | src/models/legend.js \
16 | src/models/line.js \
17 | src/models/lineChart.js \
18 | src/models/linePlusBarChart.js \
19 | src/models/lineWithFocusChart.js \
20 | src/models/multiBar.js \
21 | src/models/multiBarChart.js \
22 | src/models/multiBarHorizontal.js \
23 | src/models/multiBarHorizontalChart.js \
24 | src/models/multiChart.js \
25 | src/models/ohlcBar.js \
26 | src/models/pie.js \
27 | src/models/pieChart.js \
28 | src/models/scatter.js \
29 | src/models/scatterChart.js \
30 | src/models/scatterPlusLineChart.js \
31 | src/models/sparkline.js \
32 | src/models/sparklinePlus.js \
33 | src/models/stackedArea.js \
34 | src/models/stackedAreaChart.js \
35 | src/outro.js
36 |
37 | JS_COMPILER = \
38 | uglifyjs
39 |
40 | all: nv.d3.js nv.d3.min.js
41 | nv.d3.js: $(JS_FILES)
42 | nv.d3.min.js: $(JS_FILES)
43 |
44 | nv.d3.js: Makefile
45 | rm -f $@
46 | cat $(filter %.js,$^) >> $@
47 |
48 | %.min.js:: Makefile
49 | rm -f $@
50 | cat $(filter %.js,$^) | $(JS_COMPILER) >> $@
51 |
52 | clean:
53 | rm -rf nv.d3.js nv.d3.min.js
54 |
55 |
56 |
--------------------------------------------------------------------------------
/ch07/web/static/nvd3/README.md:
--------------------------------------------------------------------------------
1 | Please see Novus' official statement on nvd3 with an explanation,
2 | apology, and commitment to its permanent status as an open-source
3 | project.
4 | [http://nvd3.org/statement.html](http://nvd3.org/statement.html)
5 |
6 | # nvd3 - v0.0.1
7 |
8 | A reusable chart library for d3.JS.
9 |
10 | Currently in an early stage of development, but will be a very active project. It may change quite a bit from its current state, but will always try to follow the style in which d3.js was done.
11 |
12 | You can also check out the [examples page](http://nvd3.org/ghpages/examples.html)
13 |
14 | ---
15 |
16 | If one of [the existing models](https://github.com/novus/nvd3/tree/master/src/models) doesn't meet your needs, fork the project, implement the model and an example using it, send us a pull request, for consideration for inclusion in the project.
17 |
18 | ---
19 |
20 | Minifying your fork:
21 |
22 | The Makefile requires [UglifyJS](https://github.com/mishoo/UglifyJS).
23 |
24 | The easiest way to install is to install via npm. Run `npm install
25 | uglify-js` from your home directory, then add the output from `npm bin`
26 | into your path so that you have access to `uglifyjs` from the command
27 | line (remember to restart your terminal window when adding to the path.)
28 |
29 | Once you have `uglifyjs` command available, running `make` from your
30 | fork's root directory will rebuild both `nv.d3.js` and `nv.d3.min.js`.
31 |
32 | Without UglifyJS, you won't get the minified version when running make.
33 |
34 | **We ask that you DO NOT minify pull requests...**
35 | If you need to minify, please build the pull request in a separate branch, and
36 | merge and minify in your master.
37 |
38 |
--------------------------------------------------------------------------------
/ch07/web/static/nvd3/build.bat:
--------------------------------------------------------------------------------
@echo off
rem Concatenate the nvd3 sources into nv.d3.js.
rem src\utils.js is included after tooltip.js so this build matches the
rem file list used by the Makefile and build.sh.
copy src\intro.js /B + src\core.js /B + src\tooltip.js /B + src\utils.js /B temp1.js /B
copy src\models\*.js /B temp2.js /B
copy temp1.js /B + temp2.js /B + src\outro.js /B nv.d3.js /B
del temp1.js
del temp2.js
7 |
--------------------------------------------------------------------------------
/ch07/web/static/nvd3/build.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Concatenate the nvd3 sources into nv.d3.js and, when yui-compressor is
# installed, also produce the minified nv.d3.min.js.
COMPRESSOR=$(command -v yui-compressor)
cat src/intro.js src/core.js src/tooltip.js src/utils.js src/models/*.js src/outro.js > nv.d3.js
# Quote the variable and require a non-empty, executable path: an unquoted
# empty value turns `[ -e ]` into a one-argument test that is always true.
if [ -n "$COMPRESSOR" ] && [ -x "$COMPRESSOR" ]; then
  "$COMPRESSOR" --type js -o nv.d3.min.js nv.d3.js
fi
7 |
--------------------------------------------------------------------------------
/ch07/web/static/nvd3/deprecated/lineChart-old.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
84 |
--------------------------------------------------------------------------------
/ch07/web/static/nvd3/examples/images/grey-minus.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code/5a5a5f11de5ed0c4949bf1e7c80fba329fc72120/ch07/web/static/nvd3/examples/images/grey-minus.png
--------------------------------------------------------------------------------
/ch07/web/static/nvd3/examples/images/grey-plus.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code/5a5a5f11de5ed0c4949bf1e7c80fba329fc72120/ch07/web/static/nvd3/examples/images/grey-plus.png
--------------------------------------------------------------------------------
/ch07/web/static/nvd3/examples/legend.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
76 |
--------------------------------------------------------------------------------
/ch07/web/static/nvd3/examples/line.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
96 |
--------------------------------------------------------------------------------
/ch07/web/static/nvd3/examples/lineWithFocusChart.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
88 |
--------------------------------------------------------------------------------
/ch07/web/static/nvd3/examples/multiBar.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
93 |
--------------------------------------------------------------------------------
/ch07/web/static/nvd3/examples/multiBarChart.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
81 |
--------------------------------------------------------------------------------
/ch07/web/static/nvd3/examples/pie.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
94 |
--------------------------------------------------------------------------------
/ch07/web/static/nvd3/examples/sparkline.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
21 |
22 |
23 | Sparkline:
24 |
25 |
26 |
27 |
28 |
29 |
63 |
--------------------------------------------------------------------------------
/ch07/web/static/nvd3/examples/sparklinePlus.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
18 |
19 |
20 | SparklinePlus:
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
68 |
--------------------------------------------------------------------------------
/ch07/web/static/nvd3/examples/stream_layers.js:
--------------------------------------------------------------------------------
1 |
/* Random layered test data, inspired by Lee Byron's test data generator.
   Produces n layers of m points each; o is the baseline offset (0 when omitted). */
function stream_layers(n, m, o) {
  if (arguments.length < 3) {
    o = 0;
  }

  // Add one randomly sized and positioned bell-shaped bump to the layer, in place.
  function addBump(layer) {
    var height = 1 / (.1 + Math.random());
    var center = 2 * Math.random() - .5;
    var sharpness = 10 / (.1 + Math.random());
    for (var k = 0; k < m; k++) {
      var offset = (k / m - center) * sharpness;
      layer[k] += height * Math.exp(-offset * offset);
    }
  }

  return d3.range(n).map(function() {
    var layer = [];
    var k;
    for (k = 0; k < m; k++) {
      layer[k] = o + o * Math.random();
    }
    for (k = 0; k < 5; k++) {
      addBump(layer);
    }
    return layer.map(stream_index);
  });
}
21 |
/* Deterministic layer generator using gamma-distribution-shaped curves:
   n layers of m points each, every layer shifted relative to the previous one. */
function stream_waves(n, m) {
  var layerIds = d3.range(n);
  return layerIds.map(function(layer) {
    var samples = d3.range(m).map(function(step) {
      var x = 20 * step / m - layer / 3;
      return 2 * x * Math.exp(-.5 * x);
    });
    return samples.map(stream_index);
  });
}
31 |
/* Convert a raw sample into a chart point: x is the sample's position, y is the sample clamped at zero. */
function stream_index(d, i) {
  var clamped = Math.max(0, d);
  var point = {x: i, y: clamped};
  return point;
}
35 |
36 |
--------------------------------------------------------------------------------
/ch07/web/static/nvd3/src/intro.js:
--------------------------------------------------------------------------------
1 | (function(){
2 |
--------------------------------------------------------------------------------
/ch07/web/static/nvd3/src/outro.js:
--------------------------------------------------------------------------------
1 | })();
--------------------------------------------------------------------------------
/ch07/web/templates/partials/emails.html:
--------------------------------------------------------------------------------
1 |
2 | {% extends "layout.html" %}
3 |
4 |
5 | {% import "macros.jnj" as common %}
6 |
7 |
8 |
9 | {% block content -%}
10 | Emails
11 |
14 |
15 |
16 |
17 | From |
18 | Subject |
19 | Date |
20 |
21 |
22 | {% for email in emails %}
23 |
24 | {{ common.display_email_address(email['from'])|safe }} |
25 | {{ common.display_link(email['message_id'], '/email', email['subject'])|safe }} |
26 | {{ email['date'] }} |
27 |
28 | {% endfor %}
29 |
30 |
31 | {% if nav_offsets and nav_path -%}
32 | {{ common.display_nav(nav_offsets, nav_path, query)|safe }}
33 | {% endif -%}
34 |
35 | {% endblock -%}
36 |
--------------------------------------------------------------------------------
/ch08/README.md:
--------------------------------------------------------------------------------
1 | Agile Data the Book
2 | ===================
3 |
4 | You can buy the book [here](http://shop.oreilly.com/product/0636920025054.do). You can read the book on [O'Reilly OFPS](http://ofps.oreilly.com/titles/9781449326265/) now. Work the chapter code examples as you go. Don't forget to initialize your python environment. Try linux (apt-get, yum) or OS X (brew, port) packages if any of the requirements don't install in your [virtualenv](http://www.virtualenv.org/en/latest/).
5 |
6 | Agile Data - Chapter 8: Making Predictions
7 | ===============================================================
8 |
9 | ## Setup Python Virtual Environment ##
10 |
11 | ```
12 | # From project root
13 |
14 | # Setup python virtualenv
15 | virtualenv -p `which python2.7` venv --distribute
16 | source venv/bin/activate
17 | pip install -r requirements.txt
18 | ```
19 |
20 | ## Run Analytic Inbox Application ##
21 |
22 | Most of this chapter will involve running our Python/Flask web application.
23 |
24 | ```
25 | python web/index.py
26 | ```
27 |
28 | ## Smooth Email Sent Time Distributions ##
29 |
30 | See previous - start the web app, the fix is applied to 'web/index.py'.
31 |
32 | ## Calculate Reply Probability ##
33 |
34 | To calculate, run:
35 |
36 | ```
37 | pig -l /tmp -x local -v -w p_reply.pig
38 | ```
39 |
40 | This will create a mongodb store: 'mongodb://localhost/agile_data.related_addresses'
41 |
42 | ## Check MongoDB for P(reply|from) and P(reply|to) ##
43 |
44 | Run 'mongo.js', or in the mongo terminal:
45 |
46 | ```
47 | mongo agile_data
48 | db.reply_ratios.ensureIndex({from: 1, to: 1});
49 | db.reply_ratios.findOne();
50 | ```
51 |
52 |
--------------------------------------------------------------------------------
/ch08/web/config.py:
--------------------------------------------------------------------------------
1 | EMAILS_PER_LIST_PAGE=15
2 | EMAILS_PER_ADDRESS_PAGE=6
3 | ELASTIC_URL='http://localhost:9200/inbox'
4 | MY_EMAIL='russell.jurney@gmail.com'
5 |
--------------------------------------------------------------------------------
/ch08/web/smoother.py:
--------------------------------------------------------------------------------
1 | # Based on http://www.scipy.org/Cookbook/SignalSmooth
2 |
3 | import numpy as np
4 |
class Smoother():
    """Smooth the values of a list of records with a windowed moving average.

    Takes an array of objects as input, and the data key of the object for
    access. Call smooth() to compute the smoothed series, then to_objects()
    to get it back as per-hour records.
    """

    def __init__(self, raw_data, data_key):
        """Keep the raw records and extract the values stored under data_key."""
        self.raw_data = raw_data
        self.data = self.to_array(raw_data, data_key)

    def to_array(self, in_data, data_key):
        """Given an array of objects with values, return a numpy array of values."""
        return np.array([datum[data_key] for datum in in_data])

    def smooth(self, window_len=5, window='hamming'):
        """Smoothing method from SciPy SignalSmooth Cookbook: http://www.scipy.org/Cookbook/SignalSmooth

        window names a numpy window function (e.g. 'hamming') and window_len is
        its width. The result is stored in self.smoothed and has the same
        length as the input data.
        """
        x = self.data
        if window_len < 2:
            # A window of one point (or less) does no smoothing; the slicing
            # below would otherwise produce an empty result for window_len=1.
            self.smoothed = np.asarray(x, dtype=float)
            return
        # Pad both ends with reflected copies of the signal so the window has
        # data to work with at the edges.
        s = np.r_[2*x[0]-x[window_len:1:-1], x, 2*x[-1]-x[-1:-window_len:-1]]
        w = getattr(np, window)(window_len)
        y = np.convolve(w/w.sum(), s, mode='same')
        # Trim the padding back off.
        self.smoothed = y[window_len-1:-window_len+1]

    def to_objects(self):
        """Return the smoothed values as 24 {'sent_hour', 'total'} records, '00'..'23'.

        smooth() must have been called first; totals are rounded to whole numbers.
        """
        objects = list()
        hours = [ '%02d' % i for i in range(24) ]
        for idx, val in enumerate(hours):
            objects.append({"sent_hour": val, "total": round(self.smoothed[idx], 0)})
        return objects
34 |
--------------------------------------------------------------------------------
/ch08/web/static/bootstrap/img/glyphicons-halflings-white.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code/5a5a5f11de5ed0c4949bf1e7c80fba329fc72120/ch08/web/static/bootstrap/img/glyphicons-halflings-white.png
--------------------------------------------------------------------------------
/ch08/web/static/bootstrap/img/glyphicons-halflings.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code/5a5a5f11de5ed0c4949bf1e7c80fba329fc72120/ch08/web/static/bootstrap/img/glyphicons-halflings.png
--------------------------------------------------------------------------------
/ch08/web/static/nvd3/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | # Jekyll Files #
3 | ################
4 | _site
5 |
6 |
7 | # Random Files #
8 | ################
9 | *.swp
10 | *~
11 | *.log
12 |
13 |
14 | # Private Test Data #
15 | #####################
16 | *REALDATA*
17 |
18 |
19 | # OS generated files #
20 | ######################
21 | .DS_Store*
22 | ehthumbs.db
23 | Icon?
24 | Thumbs.db
25 |
--------------------------------------------------------------------------------
/ch08/web/static/nvd3/Makefile:
--------------------------------------------------------------------------------
1 | JS_FILES = \
2 | src/intro.js \
3 | src/core.js \
4 | src/tooltip.js \
5 | src/utils.js \
6 | src/models/axis.js \
7 | src/models/historicalBar.js \
8 | src/models/bullet.js \
9 | src/models/bulletChart.js \
10 | src/models/cumulativeLineChart.js \
11 | src/models/discreteBar.js \
12 | src/models/discreteBarChart.js \
13 | src/models/distribution.js \
14 | src/models/indentedTree.js \
15 | src/models/legend.js \
16 | src/models/line.js \
17 | src/models/lineChart.js \
18 | src/models/linePlusBarChart.js \
19 | src/models/lineWithFocusChart.js \
20 | src/models/multiBar.js \
21 | src/models/multiBarChart.js \
22 | src/models/multiBarHorizontal.js \
23 | src/models/multiBarHorizontalChart.js \
24 | src/models/multiChart.js \
25 | src/models/ohlcBar.js \
26 | src/models/pie.js \
27 | src/models/pieChart.js \
28 | src/models/scatter.js \
29 | src/models/scatterChart.js \
30 | src/models/scatterPlusLineChart.js \
31 | src/models/sparkline.js \
32 | src/models/sparklinePlus.js \
33 | src/models/stackedArea.js \
34 | src/models/stackedAreaChart.js \
35 | src/outro.js
36 |
37 | JS_COMPILER = \
38 | uglifyjs
39 |
# `all` and `clean` name actions, not files; declaring them phony keeps a stray
# file called "all" or "clean" from silently disabling them.
.PHONY: all clean

# Default goal: build both the plain and the minified bundle.
all: nv.d3.js nv.d3.min.js
nv.d3.js: $(JS_FILES)
nv.d3.min.js: $(JS_FILES)

# Concatenate the sources. $(filter %.js,$^) keeps only the .js prerequisites,
# so the Makefile itself (listed to force a rebuild when it changes) is not
# included in the output. The target is removed first, so `>>` starts from an
# empty file.
nv.d3.js: Makefile
	rm -f $@
	cat $(filter %.js,$^) >> $@

# Same concatenation, piped through $(JS_COMPILER) to produce the minified file.
%.min.js:: Makefile
	rm -f $@
	cat $(filter %.js,$^) | $(JS_COMPILER) >> $@

# Remove the generated bundles.
clean:
	rm -rf nv.d3.js nv.d3.min.js
54 |
55 |
56 |
--------------------------------------------------------------------------------
/ch08/web/static/nvd3/README.md:
--------------------------------------------------------------------------------
1 | Please see Novus' official statement on nvd3 with an explanation,
2 | apology, and commitment to its permanent status as an open-source
3 | project.
4 | [http://nvd3.org/statement.html](http://nvd3.org/statement.html)
5 |
6 | # nvd3 - v0.0.1
7 |
8 | A reusable chart library for d3.JS.
9 |
10 | Currently in an early stage of development, but will be a very active project. It may change quite a bit from its current state, but will always try to follow the style in which d3.js was done.
11 |
12 | You can also check out the [examples page](http://nvd3.org/ghpages/examples.html)
13 |
14 | ---
15 |
16 | If one of [the existing models](https://github.com/novus/nvd3/tree/master/src/models) doesn't meet your needs, fork the project, implement the model and an example using it, send us a pull request, for consideration for inclusion in the project.
17 |
18 | ---
19 |
20 | Minifying your fork:
21 |
22 | The Makefile requires [UglifyJS](https://github.com/mishoo/UglifyJS).
23 |
24 | The easiest way to install it is via npm. Run `npm install
25 | uglify-js` from your home directory, then add the output from `npm bin`
26 | into your path so that you have access to `uglifyjs` from the command
27 | line (remember to restart your terminal window when adding to the path.)
28 |
29 | Once you have `uglifyjs` command available, running `make` from your
30 | fork's root directory will rebuild both `nv.d3.js` and `nv.d3.min.js`.
31 |
32 | Without UglifyJS, you won't get the minified version when running make.
33 |
34 | **We ask that you DO NOT minify pull requests.**
35 | If you need to minify, please build the pull request in a separate branch, and
36 | merge and minify in your master.
37 |
38 |
--------------------------------------------------------------------------------
/ch08/web/static/nvd3/build.bat:
--------------------------------------------------------------------------------
@echo off
rem Build nv.d3.js by binary-concatenating the sources:
rem   intro + core + tooltip + utils, then every model, then outro.
rem src\utils.js is included here because the Makefile and build.sh both
rem bundle it right after tooltip.js.
copy src\intro.js /B + src\core.js /B + src\tooltip.js /B + src\utils.js /B temp1.js /B
copy src\models\*.js /B temp2.js /B
copy temp1.js /B + temp2.js /B + src\outro.js /B nv.d3.js /B
rem Remove the intermediate files.
del temp1.js
del temp2.js
7 |
--------------------------------------------------------------------------------
/ch08/web/static/nvd3/build.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Build nv.d3.js by concatenating the sources, then minify it to nv.d3.min.js
# when yui-compressor is installed.
COMPRESSOR=`which yui-compressor`
cat src/intro.js src/core.js src/tooltip.js src/utils.js src/models/*.js src/outro.js > nv.d3.js
# When `which` finds nothing, COMPRESSOR is empty. An unquoted `[ -e $COMPRESSOR ]`
# then collapses to `[ -e ]`, which is true, and the script would try to run
# "--type" as a command. Require a non-empty, executable path instead.
if [ -n "$COMPRESSOR" ] && [ -x "$COMPRESSOR" ]; then
    "$COMPRESSOR" --type js -o nv.d3.min.js nv.d3.js
fi
7 |
--------------------------------------------------------------------------------
/ch08/web/static/nvd3/deprecated/lineChart-old.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
84 |
--------------------------------------------------------------------------------
/ch08/web/static/nvd3/examples/images/grey-minus.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code/5a5a5f11de5ed0c4949bf1e7c80fba329fc72120/ch08/web/static/nvd3/examples/images/grey-minus.png
--------------------------------------------------------------------------------
/ch08/web/static/nvd3/examples/images/grey-plus.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code/5a5a5f11de5ed0c4949bf1e7c80fba329fc72120/ch08/web/static/nvd3/examples/images/grey-plus.png
--------------------------------------------------------------------------------
/ch08/web/static/nvd3/examples/legend.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
76 |
--------------------------------------------------------------------------------
/ch08/web/static/nvd3/examples/line.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
96 |
--------------------------------------------------------------------------------
/ch08/web/static/nvd3/examples/lineWithFocusChart.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
88 |
--------------------------------------------------------------------------------
/ch08/web/static/nvd3/examples/multiBar.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
93 |
--------------------------------------------------------------------------------
/ch08/web/static/nvd3/examples/multiBarChart.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
81 |
--------------------------------------------------------------------------------
/ch08/web/static/nvd3/examples/pie.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
94 |
--------------------------------------------------------------------------------
/ch08/web/static/nvd3/examples/sparkline.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
21 |
22 |
23 | Sparkline:
24 |
25 |
26 |
27 |
28 |
29 |
63 |
--------------------------------------------------------------------------------
/ch08/web/static/nvd3/examples/sparklinePlus.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
18 |
19 |
20 | SparklinePlus:
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
68 |
--------------------------------------------------------------------------------
/ch08/web/static/nvd3/examples/stream_layers.js:
--------------------------------------------------------------------------------
1 |
/* Inspired by Lee Byron's test data generator. */
function stream_layers(n, m, o) {
  // The baseline offset is 0 when the caller omits the third argument.
  if (arguments.length < 3) {
    o = 0;
  }

  // Add one bell-shaped bump with random height, position and width to `layer`.
  function bump(layer) {
    var height = 1 / (0.1 + Math.random());
    var center = 2 * Math.random() - 0.5;
    var sharpness = 10 / (0.1 + Math.random());
    var idx = 0;
    while (idx < m) {
      var offset = (idx / m - center) * sharpness;
      layer[idx] += height * Math.exp(-offset * offset);
      idx += 1;
    }
  }

  // Build one layer of m points: random baseline, then five bumps on top.
  function makeLayer() {
    var layer = [];
    var k;
    for (k = 0; k < m; k += 1) {
      layer[k] = o + o * Math.random();
    }
    for (k = 0; k < 5; k += 1) {
      bump(layer);
    }
    return layer.map(stream_index);
  }

  // n layers, each converted to {x, y} points by stream_index.
  return d3.range(n).map(function () {
    return makeLayer();
  });
}
21 |
/* Another layer generator using gamma distributions. */
function stream_waves(n, m) {
  // Build layer i: m samples of 2 * x * exp(-x / 2), with x shifted by i / 3.
  function wave(i) {
    var samples = d3.range(m).map(function (j) {
      var x = 20 * j / m - i / 3;
      return 2 * x * Math.exp(-0.5 * x);
    });
    return samples.map(stream_index);
  }

  return d3.range(n).map(function (i) {
    return wave(i);
  });
}
31 |
// Convert a raw value and its array index into an {x, y} point.
// Negative values are clamped to 0.
function stream_index(d, i) {
  var point = {};
  point.x = i;
  point.y = Math.max(0, d);
  return point;
}
35 |
36 |
--------------------------------------------------------------------------------
/ch08/web/static/nvd3/src/intro.js:
--------------------------------------------------------------------------------
1 | (function(){
2 |
--------------------------------------------------------------------------------
/ch08/web/static/nvd3/src/outro.js:
--------------------------------------------------------------------------------
1 | })();
--------------------------------------------------------------------------------
/ch08/web/templates/partials/emails.html:
--------------------------------------------------------------------------------
1 |
2 | {% extends "layout.html" %}
3 |
4 |
5 | {% import "macros.jnj" as common %}
6 |
7 |
8 |
9 | {% block content -%}
10 | Emails
11 |
14 |
15 |
16 |
17 | From |
18 | Subject |
19 | Date |
20 |
21 |
22 | {% for email in emails %}
23 |
24 | {{ common.display_email_address(email['from'])|safe }} |
25 | {{ common.display_link(email['message_id'], '/email', email['subject'])|safe }} |
26 | {{ email['date'] }} |
27 |
28 | {% endfor %}
29 |
30 |
31 | {% if nav_offsets and nav_path -%}
32 | {{ common.display_nav(nav_offsets, nav_path, query)|safe }}
33 | {% endif -%}
34 |
35 | {% endblock -%}
36 |
--------------------------------------------------------------------------------
/ch09/mongo.js:
--------------------------------------------------------------------------------
// Drop all relations, to recreate
[
  'overall_reply_ratio',
  'from_to_reply_ratios',
  'p_sent_from_to',
  'hourly_from_reply_probs',
  'p_sent_hour',
  'token_reply_rates'
].forEach(function (name) {
  db.getCollection(name).drop();
});

// Token-keyed collections are indexed on the token.
['p_token', 'token_reply_rates', 'token_no_reply_rates'].forEach(function (name) {
  db.getCollection(name).ensureIndex({'token': 1});
});

// From/to ratio collections are indexed on the (from, to) pair.
['from_to_reply_ratios', 'from_to_no_reply_ratios'].forEach(function (name) {
  db.getCollection(name).ensureIndex({from: 1, to: 1});
});
14 |
15 | // {
16 | // "_id" : ObjectId("511700c330048b60597e7c04"),
17 | // "token" : "public",
18 | // "reply_rate" : 0.6969366812896153
19 | // }
--------------------------------------------------------------------------------
/ch09/pig/hamming.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Based on http://www.scipy.org/Cookbook/SignalSmooth
3 |
4 | import numpy as np
5 | import sys, os
6 |
def smooth(data, window_len=5, window='hamming'):
    """Smooth a 1-D numpy array with a normalised window function.

    data       -- numpy array of values to smooth
    window_len -- number of samples in the window
    window     -- name of a numpy window function (looked up with getattr on numpy)

    Returns the convolved signal with the edge padding trimmed off again.
    """
    # Extend both ends with values mirrored around the first/last sample.
    head = 2 * data[0] - data[window_len:1:-1]
    tail = 2 * data[-1] - data[-1:-window_len:-1]
    padded = np.r_[head, data, tail]

    # Window weights, scaled to sum to 1.
    kernel = getattr(np, window)(window_len)
    kernel = kernel / kernel.sum()

    convolved = np.convolve(kernel, padded, mode='same')
    return convolved[window_len - 1:-window_len + 1]
13 |
def main():
    """Stream filter: smooth each address's hourly values read from stdin.

    Each input line is split into ``address<TAB>bag``, where the bag is
    sliced as ``{(hour,p_reply),(hour,p_reply),...}`` (presumably Pig's text
    form of the streamed relation; confirm against smooth_times.pig).
    One ``address<TAB>hour<TAB>smoothed_value`` line is written to stdout per
    hour.
    """
    for line in sys.stdin:
        # Strip the line terminator up front so the bag slicing below does not
        # depend on whether the last line of input ends with a newline.
        line = line.rstrip('\r\n')
        if not line:
            continue
        email, hour_dist = line.split('\t')
        # Drop the leading "{(" and trailing ")}", then split into tuples.
        vals = hour_dist[2:-2].split('),(')
        hours = []
        data = []
        for val in vals:
            # Parse each "hour,p_reply" pair once, keeping the hour for output.
            hour, p_reply = val.rsplit(',', 1)
            hours.append(hour)
            data.append(float(p_reply))
        smoothed = smooth(np.array(data)).flatten()
        for hour, value in zip(hours, smoothed):
            # Single-argument call form prints identically on Python 2 and 3.
            print(email + "\t" + hour + "\t" + str(value))

if __name__ == "__main__":
    main()
29 |
--------------------------------------------------------------------------------
/ch09/pig/publish_topics.pig:
--------------------------------------------------------------------------------
/* Set Home Directory - where we install software */
%default HOME `echo \$HOME/Software/`

/* MongoDB libraries and configuration */
REGISTER $HOME/mongo-hadoop/mongo-2.10.1.jar
REGISTER $HOME/mongo-hadoop/core/target/mongo-hadoop-core-1.1.0-SNAPSHOT.jar
REGISTER $HOME/mongo-hadoop/pig/target/mongo-hadoop-pig-1.1.0-SNAPSHOT.jar

DEFINE MongoStorage com.mongodb.hadoop.pig.MongoStorage();

/* Publish (token, reply_rate) pairs from /tmp/reply_rates.txt */
token_reply_rates = LOAD '/tmp/reply_rates.txt' AS (token:chararray, reply_rate:double);
STORE token_reply_rates INTO 'mongodb://localhost/agile_data.token_reply_rates' USING MongoStorage();

/* Publish (token, reply_rate) pairs from /tmp/no_reply_rates.txt */
token_no_reply_rates = LOAD '/tmp/no_reply_rates.txt' AS (token:chararray, reply_rate:double);
STORE token_no_reply_rates INTO 'mongodb://localhost/agile_data.token_no_reply_rates' USING MongoStorage();

/* Publish (token, prob) pairs from /tmp/p_token.txt */
p_token = LOAD '/tmp/p_token.txt' AS (token:chararray, prob:double);
STORE p_token INTO 'mongodb://localhost/agile_data.p_token' USING MongoStorage();
19 |
--------------------------------------------------------------------------------
/ch09/pig/smooth_times.pig:
--------------------------------------------------------------------------------
/* Set Home Directory - where we install software */
%default HOME `echo \$HOME/Software/`

/* Avro uses json-simple, and is in piggybank until Pig 0.12, where AvroStorage and TrevniStorage are builtins */
REGISTER $HOME/pig/build/ivy/lib/Pig/avro-1.5.3.jar
REGISTER $HOME/pig/build/ivy/lib/Pig/json-simple-1.1.jar
REGISTER $HOME/pig/contrib/piggybank/java/piggybank.jar

DEFINE AvroStorage org.apache.pig.piggybank.storage.avro.AvroStorage();

/* MongoDB libraries and configuration */
REGISTER $HOME/mongo-hadoop/mongo-2.10.1.jar
REGISTER $HOME/mongo-hadoop/core/target/mongo-hadoop-core-1.1.0-SNAPSHOT.jar
REGISTER $HOME/mongo-hadoop/pig/target/mongo-hadoop-pig-1.1.0-SNAPSHOT.jar

DEFINE MongoStorage com.mongodb.hadoop.pig.MongoStorage();

SET default_parallel 10
SET mapred.map.tasks.speculative.execution false
SET mapred.reduce.tasks.speculative.execution false

/* Clear the outputs of any previous run */
rmf /tmp/smoothed_sent_dists.avro
rmf /tmp/smoothed_sent_dists.txt

time_dists_per_email = LOAD '/tmp/date_filled_dist.avro' USING AvroStorage(); -- as (address:chararray, sent_distribution:bag{t:tuple(hour:chararray, p_reply:double)});

/* Stream every record through hamming.py, which emits one (address, hour, p_reply) row per hour */
DEFINE smooth_stream `hamming.py` SHIP ('hamming.py');
smoothed_time_dists_per_email = STREAM time_dists_per_email THROUGH smooth_stream AS (address:chararray, hour:chararray, p_reply:double);

/* Regroup the smoothed rows into one hour-sorted distribution per address */
answer = FOREACH (GROUP smoothed_time_dists_per_email BY address) {
  sorted = ORDER smoothed_time_dists_per_email BY hour;
  GENERATE group AS address, sorted.(hour, p_reply) AS sent_distribution;
};
STORE answer INTO '/tmp/smoothed_sent_dists.avro' USING AvroStorage();
STORE answer INTO '/tmp/smoothed_sent_dists.txt';
STORE answer INTO 'mongodb://localhost/agile_data.hourly_from_reply_probs' USING MongoStorage();

/*p_sent_hour = load '/tmp/p_sent_hour.txt' as (from:chararray, distribution:bag{t:tuple(sent_hour:chararray, ratio:double)});
store p_sent_hour into 'mongodb://localhost/agile_data.p_sent_hour' using MongoStorage();

*/
--------------------------------------------------------------------------------
/ch09/pig/test_results.pig:
--------------------------------------------------------------------------------
/* Set Home Directory - where we install software */
%default HOME `echo \$HOME/Software/`

/* Avro uses json-simple, and is in piggybank until Pig 0.12, where AvroStorage and TrevniStorage are builtins */
REGISTER $HOME/pig/build/ivy/lib/Pig/avro-1.5.3.jar
REGISTER $HOME/pig/build/ivy/lib/Pig/json-simple-1.1.jar
REGISTER $HOME/pig/contrib/piggybank/java/piggybank.jar

DEFINE AvroStorage org.apache.pig.piggybank.storage.avro.AvroStorage();
DEFINE ABS org.apache.pig.piggybank.evaluation.math.ABS();

rmf /tmp/final_answer.txt

/* One row per (message, weight pair) with the predicted reply probability */
results = LOAD '../results.txt' AS (message_id:chararray, p_tokens_weight:double, p_from_to_reply_weight:double, p_reply:double);

emails = LOAD '/me/Data/test_mbox' USING AvroStorage();
emails = FOREACH emails GENERATE message_id, in_reply_to;

/* A prediction has a match when some email's in_reply_to equals its message_id */
with_results = JOIN results BY message_id LEFT OUTER, emails BY in_reply_to;

/* result is 1.0 when a matching email was found, 0.0 otherwise */
test_results = FOREACH with_results GENERATE (double)((emails::message_id IS NOT NULL) ? 1 : 0) AS result:double, *;
errors = FOREACH test_results GENERATE p_tokens_weight AS p_tokens_weight,
                                       p_from_to_reply_weight AS p_from_to_reply_weight,
                                       (double)ABS(result - p_reply) AS error:double;

/* Mean absolute error per weight pair, largest error first */
answer = FOREACH (GROUP errors BY (p_tokens_weight, p_from_to_reply_weight)) GENERATE FLATTEN(group) AS (p_tokens_weight, p_from_to_reply_weight),
                                                                                      SUM(errors.error)/COUNT(errors.error) AS avg_error;
final_answer = ORDER answer BY avg_error DESC;
STORE final_answer INTO '/tmp/final_answer.txt';
29 |
--------------------------------------------------------------------------------
/ch09/tune_weights.py:
--------------------------------------------------------------------------------
1 | import pymongo
2 | from datetime import datetime
3 | from avro import schema, datafile, io
4 | import pprint
5 | import sys
6 | import json
7 | from nltk.tokenize import word_tokenize
8 |
9 | import dateutil.parser
10 |
pp = pprint.PrettyPrinter()

# MongoDB collections that hold the probabilities read by the loop below.
conn = pymongo.Connection() # defaults to localhost
db = conn.agile_data
from_to_reply_ratios = db['from_to_reply_ratios']
hourly_from_reply_probs = db['hourly_from_reply_probs']
token_reply_rates = db['token_reply_rates']

# Test reading avros
rec_reader = io.DatumReader()
# Create a 'data file' (avro file) reader.
# Avro container files are binary, so open in "rb" to avoid newline
# translation or text decoding of the bytes.
df_reader = datafile.DataFileReader(
    open("/me/Data/test_mbox/part-1.avro", "rb"),
    rec_reader
)
26 |
# Go through all the avro emails...
for record in df_reader:
    # Get the message_id, from, first to, and message body
    message_id = record['message_id']
    froms = record['from']['address']

    # Reset per record: without this, a message with no recipients would reuse
    # the previous message's address (or raise NameError on the first record).
    to = None
    if record['tos'] and record['tos'][0]:
        to = record['tos'][0]['address']
    if to is None:
        continue

    # For each token in the body, if there's a match in MongoDB,
    # append it and average all of them at the end
    word_probs = []
    body = record['body']
    for token in word_tokenize(body):
        search = token_reply_rates.find_one({'token': token})
        if search:
            word_probs.append(search['reply_rate'])
    # Skip messages where no token has a known reply rate.
    if not word_probs:
        continue
    token_rate = sum(word_probs) / float(len(word_probs))

    # Use from/to probabilities when available
    ftrr = from_to_reply_ratios.find_one({'from': froms, 'to': to})
    if not ftrr:
        continue
    p_from_to_reply = ftrr['ratio']

    # Now try 0.1 increments of weights between these two vectors to weight them
    for i in [x / 10.0 for x in range(0, 11, 1)]:
        result = (token_rate * i) + (p_from_to_reply * (1 - i))
        # Columns: message_id, token weight, from/to weight, combined probability.
        # Single-argument call form prints identically on Python 2 and 3.
        print(message_id + "\t" + str(i) + "\t" + str(1 - i) + "\t" + str(result))
61 |
62 | # Tada - followup with test_results.pig to find proper weight. Zoom in more as needed.
63 |
--------------------------------------------------------------------------------
/ch09/web/config.py:
--------------------------------------------------------------------------------
1 | EMAILS_PER_LIST_PAGE=15
2 | EMAILS_PER_ADDRESS_PAGE=6
3 | ELASTIC_URL='http://localhost:9200/inbox'
4 | MY_EMAIL='russell.jurney@gmail.com'
5 |
--------------------------------------------------------------------------------
/ch09/web/smoother.py:
--------------------------------------------------------------------------------
1 | # Based on http://www.scipy.org/Cookbook/SignalSmooth
2 |
3 | import numpy as np
4 |
class Smoother():
    """Smooth one numeric field of a list of records.

    Based on the SciPy SignalSmooth cookbook:
    http://www.scipy.org/Cookbook/SignalSmooth

    Typical use: construct with the records and the key to read, call
    smooth(), then to_objects() to get the smoothed values back as records.
    """

    def __init__(self, raw_data, data_key):
        """Takes an array of objects as input, and the data key of the object for access."""
        self.raw_data = raw_data
        # Debug trace of the input. The call form prints the same text under
        # Python 2 and also parses under Python 3.
        print(self.raw_data)
        self.data = self.to_array(raw_data, data_key)

    def to_array(self, in_data, data_key):
        """Given an array of objects with values, return a numpy array of values."""
        return np.array([datum[data_key] for datum in in_data])

    def smooth(self, window_len=5, window='blackman'):
        """Smoothing method from SciPy SignalSmooth Cookbook: http://www.scipy.org/Cookbook/SignalSmooth

        window_len -- number of samples in the smoothing window
        window     -- name of a numpy window function (looked up with getattr on numpy)

        The result is stored on self.smoothed; nothing is returned.
        """
        x = self.data
        # Pad both ends with values mirrored around the first/last sample so the
        # window has data to work with at the edges.
        s = np.r_[2*x[0]-x[window_len:1:-1], x, 2*x[-1]-x[-1:-window_len:-1]]
        w = getattr(np, window)(window_len)
        # Normalise the window so it sums to 1.
        y = np.convolve(w/w.sum(), s, mode='same')
        # Trim the padding again so the output lines up with the input.
        self.smoothed = y[window_len-1:-window_len+1]

    def to_objects(self):
        """Return the first 24 smoothed values as records keyed by zero-padded hour.

        Requires smooth() to have been called first (it sets self.smoothed).
        Each record is {"sent_hour": "00".."23", "total": <value rounded to 0 places>}.
        """
        hours = ['%02d' % i for i in range(24)]
        return [
            {"sent_hour": hour, "total": round(self.smoothed[idx], 0)}
            for idx, hour in enumerate(hours)
        ]
34 |
--------------------------------------------------------------------------------
/ch09/web/static/bootstrap/img/glyphicons-halflings-white.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code/5a5a5f11de5ed0c4949bf1e7c80fba329fc72120/ch09/web/static/bootstrap/img/glyphicons-halflings-white.png
--------------------------------------------------------------------------------
/ch09/web/static/bootstrap/img/glyphicons-halflings.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code/5a5a5f11de5ed0c4949bf1e7c80fba329fc72120/ch09/web/static/bootstrap/img/glyphicons-halflings.png
--------------------------------------------------------------------------------
/ch09/web/static/nvd3/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | # Jekyll Files #
3 | ################
4 | _site
5 |
6 |
7 | # Random Files #
8 | ################
9 | *.swp
10 | *~
11 | *.log
12 |
13 |
14 | # Private Test Data #
15 | #####################
16 | *REALDATA*
17 |
18 |
19 | # OS generated files #
20 | ######################
21 | .DS_Store*
22 | ehthumbs.db
23 | Icon?
24 | Thumbs.db
25 |
--------------------------------------------------------------------------------
/ch09/web/static/nvd3/Makefile:
--------------------------------------------------------------------------------
1 | JS_FILES = \
2 | src/intro.js \
3 | src/core.js \
4 | src/tooltip.js \
5 | src/utils.js \
6 | src/models/axis.js \
7 | src/models/historicalBar.js \
8 | src/models/bullet.js \
9 | src/models/bulletChart.js \
10 | src/models/cumulativeLineChart.js \
11 | src/models/discreteBar.js \
12 | src/models/discreteBarChart.js \
13 | src/models/distribution.js \
14 | src/models/indentedTree.js \
15 | src/models/legend.js \
16 | src/models/line.js \
17 | src/models/lineChart.js \
18 | src/models/linePlusBarChart.js \
19 | src/models/lineWithFocusChart.js \
20 | src/models/multiBar.js \
21 | src/models/multiBarChart.js \
22 | src/models/multiBarHorizontal.js \
23 | src/models/multiBarHorizontalChart.js \
24 | src/models/multiChart.js \
25 | src/models/ohlcBar.js \
26 | src/models/pie.js \
27 | src/models/pieChart.js \
28 | src/models/scatter.js \
29 | src/models/scatterChart.js \
30 | src/models/scatterPlusLineChart.js \
31 | src/models/sparkline.js \
32 | src/models/sparklinePlus.js \
33 | src/models/stackedArea.js \
34 | src/models/stackedAreaChart.js \
35 | src/outro.js
36 |
37 | JS_COMPILER = \
38 | uglifyjs
39 |
# `all` and `clean` name actions, not files; declaring them phony keeps a stray
# file called "all" or "clean" from silently disabling them.
.PHONY: all clean

# Default goal: build both the plain and the minified bundle.
all: nv.d3.js nv.d3.min.js
nv.d3.js: $(JS_FILES)
nv.d3.min.js: $(JS_FILES)

# Concatenate the sources. $(filter %.js,$^) keeps only the .js prerequisites,
# so the Makefile itself (listed to force a rebuild when it changes) is not
# included in the output. The target is removed first, so `>>` starts from an
# empty file.
nv.d3.js: Makefile
	rm -f $@
	cat $(filter %.js,$^) >> $@

# Same concatenation, piped through $(JS_COMPILER) to produce the minified file.
%.min.js:: Makefile
	rm -f $@
	cat $(filter %.js,$^) | $(JS_COMPILER) >> $@

# Remove the generated bundles.
clean:
	rm -rf nv.d3.js nv.d3.min.js
54 |
55 |
56 |
--------------------------------------------------------------------------------
/ch09/web/static/nvd3/README.md:
--------------------------------------------------------------------------------
1 | Please see Novus' official statement on nvd3 with an explanation,
2 | apology, and commitment to its permanent status as an open-source
3 | project.
4 | [http://nvd3.org/statement.html](http://nvd3.org/statement.html)
5 |
6 | # nvd3 - v0.0.1
7 |
8 | A reusable chart library for d3.JS.
9 |
10 | Currently in an early stage of development, but will be a very active project. It may change quite a bit from its current state, but will always try to follow the style in which d3.js was done.
11 |
12 | You can also check out the [examples page](http://nvd3.org/ghpages/examples.html)
13 |
14 | ---
15 |
16 | If one of [the existing models](https://github.com/novus/nvd3/tree/master/src/models) doesn't meet your needs, fork the project, implement the model and an example using it, send us a pull request, for consideration for inclusion in the project.
17 |
18 | ---
19 |
20 | Minifying your fork:
21 |
22 | The Makefile requires [UglifyJS](https://github.com/mishoo/UglifyJS).
23 |
24 | The easiest way to install it is via npm. Run `npm install
25 | uglify-js` from your home directory, then add the output from `npm bin`
26 | into your path so that you have access to `uglifyjs` from the command
27 | line (remember to restart your terminal window when adding to the path.)
28 |
29 | Once you have `uglifyjs` command available, running `make` from your
30 | fork's root directory will rebuild both `nv.d3.js` and `nv.d3.min.js`.
31 |
32 | Without UglifyJS, you won't get the minified version when running make.
33 |
34 | **We ask that you DO NOT minify pull requests.**
35 | If you need to minify, please build the pull request in a separate branch, and
36 | merge and minify in your master.
37 |
38 |
--------------------------------------------------------------------------------
/ch09/web/static/nvd3/build.bat:
--------------------------------------------------------------------------------
@echo off
rem Build nv.d3.js by binary-concatenating the sources:
rem   intro + core + tooltip + utils, then every model, then outro.
rem src\utils.js is included here because the Makefile and build.sh both
rem bundle it right after tooltip.js.
copy src\intro.js /B + src\core.js /B + src\tooltip.js /B + src\utils.js /B temp1.js /B
copy src\models\*.js /B temp2.js /B
copy temp1.js /B + temp2.js /B + src\outro.js /B nv.d3.js /B
rem Remove the intermediate files.
del temp1.js
del temp2.js
7 |
--------------------------------------------------------------------------------
/ch09/web/static/nvd3/build.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Build nv.d3.js by concatenating the sources, then minify it to nv.d3.min.js
# when yui-compressor is installed.
COMPRESSOR=`which yui-compressor`
cat src/intro.js src/core.js src/tooltip.js src/utils.js src/models/*.js src/outro.js > nv.d3.js
# When `which` finds nothing, COMPRESSOR is empty. An unquoted `[ -e $COMPRESSOR ]`
# then collapses to `[ -e ]`, which is true, and the script would try to run
# "--type" as a command. Require a non-empty, executable path instead.
if [ -n "$COMPRESSOR" ] && [ -x "$COMPRESSOR" ]; then
    "$COMPRESSOR" --type js -o nv.d3.min.js nv.d3.js
fi
7 |
--------------------------------------------------------------------------------
/ch09/web/static/nvd3/deprecated/lineChart-old.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
84 |
--------------------------------------------------------------------------------
/ch09/web/static/nvd3/examples/images/grey-minus.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code/5a5a5f11de5ed0c4949bf1e7c80fba329fc72120/ch09/web/static/nvd3/examples/images/grey-minus.png
--------------------------------------------------------------------------------
/ch09/web/static/nvd3/examples/images/grey-plus.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rjurney/Agile_Data_Code/5a5a5f11de5ed0c4949bf1e7c80fba329fc72120/ch09/web/static/nvd3/examples/images/grey-plus.png
--------------------------------------------------------------------------------
/ch09/web/static/nvd3/examples/legend.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
76 |
--------------------------------------------------------------------------------
/ch09/web/static/nvd3/examples/line.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
96 |
--------------------------------------------------------------------------------
/ch09/web/static/nvd3/examples/lineWithFocusChart.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
88 |
--------------------------------------------------------------------------------
/ch09/web/static/nvd3/examples/multiBar.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
93 |
--------------------------------------------------------------------------------
/ch09/web/static/nvd3/examples/multiBarChart.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
81 |
--------------------------------------------------------------------------------
/ch09/web/static/nvd3/examples/pie.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
94 |
--------------------------------------------------------------------------------
/ch09/web/static/nvd3/examples/sparkline.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
21 |
22 |
23 | Sparkline:
24 |
25 |
26 |
27 |
28 |
29 |
63 |
--------------------------------------------------------------------------------
/ch09/web/static/nvd3/examples/sparklinePlus.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
18 |
19 |
20 | SparklinePlus:
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
68 |
--------------------------------------------------------------------------------
/ch09/web/static/nvd3/examples/stream_layers.js:
--------------------------------------------------------------------------------
1 |
/**
 * Random test-data generator, inspired by Lee Byron's test data generator.
 *
 * Builds `n` layers of `m` samples each. Every sample starts at a random
 * baseline in [o, 2*o) and then has five random Gaussian-shaped bumps added
 * to it. Each layer is finally converted to {x, y} points by stream_index.
 *
 * @param n number of layers to generate
 * @param m number of samples per layer
 * @param o baseline offset; treated as 0 when fewer than three arguments
 *          are supplied
 * @returns array of `n` arrays, each holding `m` {x, y} points
 */
function stream_layers(n, m, o) {
  // Only the argument count decides the fallback, not the value of `o`.
  if (arguments.length < 3) {
    o = 0;
  }

  // Adds one bump of random height, centre and sharpness to `layer` in place.
  function addBump(layer) {
    var height = 1 / (.1 + Math.random());
    var centre = 2 * Math.random() - .5;
    var sharpness = 10 / (.1 + Math.random());
    for (var k = 0; k < m; k++) {
      var w = (k / m - centre) * sharpness;
      layer[k] += height * Math.exp(-w * w);
    }
  }

  return d3.range(n).map(function() {
    var layer = [];
    var k;
    // Random baseline first, then the bumps on top of it.
    for (k = 0; k < m; k++) {
      layer[k] = o + o * Math.random();
    }
    for (k = 0; k < 5; k++) {
      addBump(layer);
    }
    return layer.map(stream_index);
  });
}
21 |
/**
 * Another layer generator, using a gamma-distribution-like curve.
 *
 * Produces `n` layers of `m` samples. Sample `j` of layer `i` is
 * 2 * x * exp(-x / 2) with x = 20 * j / m - i / 3, so the output is fully
 * deterministic. Each layer is converted to {x, y} points by stream_index.
 *
 * @param n number of layers to generate
 * @param m number of samples per layer
 * @returns array of `n` arrays, each holding `m` {x, y} points
 */
function stream_waves(n, m) {
  return d3.range(n).map(function(layerIdx) {
    var samples = d3.range(m).map(function(sampleIdx) {
      var x = 20 * sampleIdx / m - layerIdx / 3;
      return 2 * x * Math.exp(-.5 * x);
    });
    return samples.map(stream_index);
  });
}
31 |
/**
 * Array.prototype.map callback that turns a raw sample into a chart point.
 *
 * @param d sample value; negative values are clamped to 0
 * @param i index of the sample, used as the x coordinate
 * @returns point object of the form {x: i, y: max(0, d)}
 */
function stream_index(d, i) {
  var clamped = Math.max(0, d);
  return {x: i, y: clamped};
}
35 |
36 |
--------------------------------------------------------------------------------
/ch09/web/static/nvd3/src/intro.js:
--------------------------------------------------------------------------------
1 | (function(){
2 |
--------------------------------------------------------------------------------
/ch09/web/static/nvd3/src/outro.js:
--------------------------------------------------------------------------------
1 | })();
--------------------------------------------------------------------------------
/ch09/web/templates/partials/emails.html:
--------------------------------------------------------------------------------
1 |
2 | {% extends "layout.html" %}
3 |
4 |
5 | {% import "macros.jnj" as common %}
6 |
7 |
8 |
9 | {% block content -%}
10 | Emails
11 |
14 |
15 |
16 |
17 | From |
18 | Subject |
19 | Date |
20 |
21 |
22 | {% for email in emails %}
23 |
24 | {{ common.display_email_address(email['from'])|safe }} |
25 | {{ common.display_link(email['message_id'], '/email', email['subject'])|safe }} |
26 | {{ email['date'] }} |
27 |
28 | {% endfor %}
29 |
30 |
31 | {% if nav_offsets and nav_path -%}
32 | {{ common.display_nav(nav_offsets, nav_path, query)|safe }}
33 | {% endif -%}
34 |
35 | {% endblock -%}
36 |
--------------------------------------------------------------------------------
/ch09/web/templates/partials/will_reply.html:
--------------------------------------------------------------------------------
1 |
2 | {% extends "layout.html" %}
3 |
4 |
5 | {% import "macros.jnj" as common %}
6 |
7 | {% block content -%}
8 |
9 |
20 |
21 | {{ result }}
22 |
23 | {% endblock -%}
24 |
--------------------------------------------------------------------------------
/pigrc:
--------------------------------------------------------------------------------
/*
 * Shared Pig bootstrap: registers the jars and defines the storage functions
 * (AvroStorage, MongoStorage) used by the scripts.
 *
 * The *_LIB paths below are %default values, so they are only fallbacks and
 * can be overridden per run with `pig -param NAME=/your/path ...`.
 */

/* Setup for Piggybank */
%default PIGGYBANK_LIB '/me/Software/pig/contrib/piggybank/java'
/* Must reference the parameter declared above (was misspelled PIBBYBANK_LIB). */
REGISTER $PIGGYBANK_LIB/piggybank.jar

/* Setup for Avro */
%default PIG_LIB '/me/Software/pig/build/ivy/lib/Pig';
REGISTER $PIG_LIB/avro-1.5.3.jar
REGISTER $PIG_LIB/json-simple-1.1.jar
DEFINE AvroStorage org.apache.pig.piggybank.storage.avro.AvroStorage();

/* Setup for MongoDB */
/* Preprocessor directives start with '%' (was '$default', which declares nothing). */
%default MONGO_LIB '/me/Software/mongo-hadoop/'
REGISTER $MONGO_LIB/mongo-2.10.1.jar
REGISTER $MONGO_LIB/core/target/mongo-hadoop-core-1.1.0-SNAPSHOT.jar
REGISTER $MONGO_LIB/pig/target/mongo-hadoop-pig-1.1.0-SNAPSHOT.jar
DEFINE MongoStorage com.mongodb.hadoop.pig.MongoStorage();
17 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | BareNecessities==0.2.8
2 | ESClient==0.5.3
3 | Flask==0.9
4 | Jinja2==2.6
5 | LEPL==5.1.3
6 | Mail==2.1.0
7 | Werkzeug==0.8.3
8 | distribute==0.6.31
9 | python-snappy
10 | avro==1.7.3
11 | -e git+https://github.com/rhec/pyelasticsearch.git#egg=pyelasticsearch
12 | pymongo==2.4.1
13 | requests==1.0.4
14 | simplejson==2.6.2
15 | wsgiref==0.1.2
16 | numpy
17 | honcho
18 | scipy
19 | dotcloud
20 | python-dateutil
21 | nltk
22 |
--------------------------------------------------------------------------------