├── .gitignore ├── .travis.yml ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md ├── aws ├── aws_launcher.py └── userdata.sh ├── bin ├── build_pipeline_heka.sh └── install_dependencies.osx.sh ├── doc └── derived_streams.md ├── examples ├── basic_local_pipeline.toml ├── decode_telemetry.toml ├── monitor_dnt.lua ├── payload_size_devel.toml ├── payload_size_devel_filter.json └── request_rates.lua ├── heka ├── cmd │ ├── heka-export │ │ └── main.go │ ├── heka-s3cat │ │ └── main.go │ ├── heka-s3list │ │ └── main.go │ └── s3cat │ │ └── main.go ├── patches │ ├── 0002-Add-cmdline-tool-for-uploading-to-S3.patch │ └── 0003-Add-more-cmds.patch ├── plugins │ ├── fx │ │ ├── common.c │ │ ├── common.h │ │ ├── executive_report.c │ │ ├── xxhash.c │ │ └── xxhash.h │ ├── hash │ │ └── lua_hash.c │ ├── kafkaconsumergroup │ │ ├── kafka_consumer_group_input.go │ │ └── kafka_consumer_group_input_test.go │ ├── s3splitfile │ │ ├── all_specs_test.go │ │ ├── s3offset_input.go │ │ ├── s3splitfile_common.go │ │ ├── s3splitfile_common_test.go │ │ ├── s3splitfile_input.go │ │ ├── s3splitfile_output.go │ │ └── testsupport │ │ │ └── schema.json │ ├── snap │ │ ├── snappy_decoder.go │ │ └── snappy_encoder.go │ └── sqs3 │ │ └── sqs3_input.go └── sandbox │ ├── decoders │ ├── decompress_payload.lua │ ├── extract_executive_summary.lua │ ├── extract_fhr_dimensions.lua │ ├── extract_telemetry_dimensions.lua │ ├── extract_tls_info.lua │ └── http_edge_decoder.lua │ ├── encoders │ ├── combine_telemetry_objects.lua │ └── es_fields.lua │ └── filters │ ├── count_by_normalized_channel.lua │ ├── dollars.lua │ ├── fhr_requests.lua │ ├── firefox_active_instances.lua │ ├── firefox_aggregator.lua │ ├── firefox_channel_switching.lua │ ├── firefox_duplicates.lua │ ├── firefox_executive_report.lua │ ├── firefox_searches.lua │ ├── firefox_usage.lua │ ├── payload_size.lua │ ├── telemetry_decoder_view.lua │ ├── telemetry_latency.lua │ ├── telemetry_requests.lua │ ├── telemetry_s3output_monitors.lua │ └── telemetry_webrtc.lua ├── hindsight ├── analysis │ └── landfill_error.lua ├── input │ ├── heka_s3.lua │ ├── serverlog_s3_bootstrap.lua │ ├── telemetry_s3_bootstrap.lua │ ├── telemetry_s3_landfill.lua │ └── telemetry_s3_snappy.lua ├── io_modules │ ├── derived_stream.lua │ ├── derived_stream │ │ ├── heka_protobuf.lua │ │ ├── redshift.lua │ │ ├── redshift │ │ │ ├── psv.lua │ │ │ └── sql.lua │ │ └── tsv.lua │ └── telemetry │ │ └── s3.lua ├── modules │ ├── agg.lua │ ├── fx.lua │ └── fx │ │ └── ping.lua └── output │ ├── cbuf2tsv.lua │ ├── crash_summary.lua │ ├── executive_summary.lua │ ├── executive_summary_full.lua │ ├── main_summary.lua │ └── telemetry_s3.lua └── reports ├── budget ├── budget.toml ├── check_targets.py ├── package.sh ├── run.sh └── schema_template.json ├── crash_stats_oom └── getting-crash-stats-for-OOM-data-to-S3.ipynb ├── derived_streams ├── derived_streams │ ├── hindsight.cfg │ ├── run │ │ ├── input │ │ │ ├── popen.lua │ │ │ ├── popen01.cfg │ │ │ ├── popen02.cfg │ │ │ ├── popen03.cfg │ │ │ ├── popen04.cfg │ │ │ ├── popen05.cfg │ │ │ ├── popen06.cfg │ │ │ ├── popen07.cfg │ │ │ ├── popen08.cfg │ │ │ ├── popen09.cfg │ │ │ ├── popen10.cfg │ │ │ ├── popen11.cfg │ │ │ ├── popen12.cfg │ │ │ ├── popen13.cfg │ │ │ ├── popen14.cfg │ │ │ ├── popen15.cfg │ │ │ ├── popen16.cfg │ │ │ ├── prune_input.cfg │ │ │ └── prune_input.lua │ │ └── output │ │ │ ├── crash_summary.lua │ │ │ ├── crash_summary01.cfg │ │ │ ├── crash_summary02.cfg │ │ │ ├── crash_summary03.cfg │ │ │ ├── crash_summary04.cfg │ │ │ ├── crash_summary05.cfg │ │ │ ├── crash_summary06.cfg │ 
│ │ ├── crash_summary07.cfg │ │ │ ├── crash_summary08.cfg │ │ │ ├── crash_summary09.cfg │ │ │ ├── crash_summary10.cfg │ │ │ ├── crash_summary11.cfg │ │ │ ├── crash_summary12.cfg │ │ │ ├── crash_summary13.cfg │ │ │ ├── crash_summary14.cfg │ │ │ ├── crash_summary15.cfg │ │ │ ├── crash_summary16.cfg │ │ │ ├── executive_summary01.cfg │ │ │ ├── executive_summary02.cfg │ │ │ ├── executive_summary03.cfg │ │ │ ├── executive_summary04.cfg │ │ │ ├── executive_summary05.cfg │ │ │ ├── executive_summary06.cfg │ │ │ ├── executive_summary07.cfg │ │ │ ├── executive_summary08.cfg │ │ │ ├── executive_summary09.cfg │ │ │ ├── executive_summary10.cfg │ │ │ ├── executive_summary11.cfg │ │ │ ├── executive_summary12.cfg │ │ │ ├── executive_summary13.cfg │ │ │ ├── executive_summary14.cfg │ │ │ ├── executive_summary15.cfg │ │ │ ├── executive_summary16.cfg │ │ │ ├── executive_summary_full.lua │ │ │ ├── main_summary.lua │ │ │ ├── main_summary01.cfg │ │ │ ├── main_summary02.cfg │ │ │ ├── main_summary03.cfg │ │ │ ├── main_summary04.cfg │ │ │ ├── main_summary05.cfg │ │ │ ├── main_summary06.cfg │ │ │ ├── main_summary07.cfg │ │ │ ├── main_summary08.cfg │ │ │ ├── main_summary09.cfg │ │ │ ├── main_summary10.cfg │ │ │ ├── main_summary11.cfg │ │ │ ├── main_summary12.cfg │ │ │ ├── main_summary13.cfg │ │ │ ├── main_summary14.cfg │ │ │ ├── main_summary15.cfg │ │ │ └── main_summary16.cfg │ ├── schema_template.json │ └── splitter.lua ├── hindsight │ └── bin │ │ ├── hindsight │ │ └── hindsight_cli ├── luasandbox-0.10.2-Linux-core.deb ├── package.sh ├── run.sh └── snappy.so ├── engagement_ratio ├── MauDau.ipynb └── README.txt ├── executive_summary ├── README.txt ├── package.sh ├── reformat_v4.py ├── run.sh └── run_executive_report.py ├── fennec_dashboard ├── README.txt └── summarize_csv.ipynb ├── loop ├── hindsight.cfg └── run │ ├── analysis │ ├── hll_check.lua │ ├── hll_check.off │ ├── retention.lua │ ├── retention_daily.cfg │ ├── retention_monthly.cfg │ ├── retention_weekly.cfg │ ├── xau.cfg │ └── xau.lua │ ├── input │ ├── server_logs.cfg │ └── server_logs.lua │ └── output │ └── placeholder.off ├── socorro_import └── ImportCrashData.ipynb ├── stability-summary ├── rollup.py ├── run.sh ├── summarize.py └── utils.py └── update-orphaning └── Update orphaning analysis using longitudinal dataset.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Scratch dir for building Heka 2 | build/ 3 | 4 | # Operating System gunk 5 | .DS_Store 6 | Thumbs.db 7 | 8 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 9 | *.o 10 | *.a 11 | *.so 12 | 13 | # Folders 14 | _obj 15 | _test 16 | 17 | # Architecture specific extensions/prefixes 18 | *.[568vq] 19 | [568vq].out 20 | 21 | *.cgo1.go 22 | *.cgo2.c 23 | _cgo_defun.c 24 | _cgo_gotypes.go 25 | _cgo_export.* 26 | 27 | _testmain.go 28 | 29 | *.exe 30 | *.test 31 | *.prof 32 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | go: 3 | - 1.4 4 | notifications: 5 | irc: 6 | channels: 7 | - "irc.mozilla.org#datapipeline" 8 | before_install: 9 | - sudo add-apt-repository ppa:andykimpe/cmake3 -y 10 | - sudo add-apt-repository ppa:maxmind/ppa -y 11 | - sudo apt-get update -qq 12 | - sudo apt-get install -y protobuf-compiler cmake libgeoip-dev libpq-dev 13 | 14 | install: 15 | - bash bin/build_pipeline_heka.sh 16 | 17 | script: 18 | - cd build/heka 19 | - . 
env.sh 20 | - cd build 21 | - make 22 | - go test github.com/mozilla-services/data-pipeline/s3splitfile 23 | - go test github.com/mozilla-services/data-pipeline/kafkaconsumergroup 24 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Community Participation Guidelines 2 | 3 | This repository is governed by Mozilla's code of conduct and etiquette guidelines. 4 | For more details, please read the 5 | [Mozilla Community Participation Guidelines](https://www.mozilla.org/about/governance/policies/participation/). 6 | 7 | ## How to Report 8 | For more information on how to report violations of the Community Participation Guidelines, please read our '[How to Report](https://www.mozilla.org/about/governance/policies/participation/reporting/)' page. 9 | 10 | 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Mozilla Services Data Pipeline 2 | 3 | This repository contains the extra bits and pieces needed to build heka 4 | for use in the [Cloud Services Data Pipeline](https://wiki.mozilla.org/CloudServices/DataPipeline). 5 | 6 | Visit us on irc.mozilla.org in `#datapipeline`. 7 | 8 | ## Building a Data Pipeline RPM 9 | 10 | Run `bash bin/build_pipeline_heka.sh` from the top level of this repo to build a heka RPM. 11 | 12 | ## Using the Data Pipeline 13 | 14 | If you are simply looking to test out some data analysis plugins and don't want to set up your own pipeline, here is the fastest way to get going: 15 | https://mana.mozilla.org/wiki/display/CLOUDSERVICES/Using+the+sandbox+manager+in+the+prod+prototype+pipeline 16 | 17 | ## Running/Testing Your Own Data Pipeline 18 | 19 | You can set up a bare-bones data pipeline of your own. You will get an endpoint that listens for HTTP POST requests, performs GeoIP lookups, and wraps them up in protobuf messages. These messages will be relayed to a stream-processor, and will be output to a local store on disk. There will be basic web-based monitoring, and the ability to add your own stream processing filters. 20 | 21 | 1. Clone this data-pipeline github repo 22 | 23 | ``` 24 | git clone https://github.com/mozilla-services/data-pipeline.git 25 | ``` 26 | 27 | 2. Build and configure heka. If you are unable to build heka, drop by #datapipeline on irc.mozilla.org and we will try to provide you with a pre-built version. 28 | 1. Make sure you have the dependencies installed: 29 | 1. OpenSSL v1.0+ (required by lua_openssl) 30 | 2. libpq, the PostgreSQL API 31 | 2. Run `bash bin/build_pipeline_heka.sh` 32 | 3. Install lua modules 33 | 34 | ``` 35 | mkdir lua_modules 36 | rsync -av build/heka/build/heka/lib/luasandbox/modules/ lua_modules/ 37 | rsync -av hindsight/modules/ hindsight/io_modules/ lua_modules/ 38 | ``` 39 | 40 | 4. Procure a `GeoLiteCity.dat` file and put it in the current dir 41 | 42 | ``` 43 | wget http://geolite.maxmind.com/download/geoip/database/GeoLiteCity.dat.gz 44 | gunzip GeoLiteCity.dat.gz 45 | ``` 46 | 47 | 3. Set up the main Pipeline using the `examples/basic_local_pipeline.toml` config file. This will listen for HTTP POSTs on port 8080, log the raw requests and decoded messages to stdout, run the example filter, and output the records to a file. 48 | 49 | ``` 50 | build/heka/build/heka/bin/hekad -config examples/basic_local_pipeline.toml 51 | ``` 52 | 53 | 4.
Check the monitoring dashboard at [http://localhost:4352](http://localhost:4352) 54 | 5. Fire off some test submissions! 55 | 56 | ``` 57 | for f in $(seq 1 20); do 58 | curl -X POST "http://localhost:8080/submit/test/$f/foo/bar/baz" -d "{\"test\":$f}" 59 | done 60 | ``` 61 | 62 | 6. Verify that your data was stored in the output file using the `heka-cat` utility 63 | 64 | ``` 65 | build/heka/build/heka/bin/heka-cat data_raw.out 66 | build/heka/build/heka/bin/heka-cat data_decoded.out 67 | ``` 68 | 69 | 7. Experiment with sandbox filters, outputs, and configurations. 70 | 71 | ### Useful things to know 72 | 73 | - GeoIP 74 | - It’s not terribly interesting to do GeoIP lookups on 127.0.0.1, so you may want to provide a `--header "X-Forwarded-For: 8.8.8.8"` argument to your curl commands. That will force a GeoIP lookup on the specified IP address (Google’s DNS server in this example). 75 | - How to configure namespaces 76 | - The example config allows submissions to either `/submit/telemetry/docid/more/path/stuff` or `/submit/test/id/and/so/on` 77 | - You can add more endpoints by modifying the `namespace_config` parameter in `basic_local_pipeline.toml`. 78 | - The namespace config is more manageable if you keep the JSON in a separate file, and run it through something like `jq -c '.' < my_namespaces.json` before putting it into the toml config. 79 | - Where to get more info about configuring heka 80 | - http://hekad.readthedocs.org/en/latest/index.html 81 | -------------------------------------------------------------------------------- /aws/aws_launcher.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/.
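# Usage sketch (added for illustration, not part of the original script's docs).
# The flags come from get_arg_parser() below; "my_config.json" and "myname" are
# hypothetical values. Run this from the aws/ directory so read_user_data() can
# find userdata.sh:
#
#   python aws_launcher.py --config-file my_config.json --owner myname
#
# Settings in the JSON config file override the entries in default_config below.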
7 | 8 | # ideas from 9 | # https://github.com/mozilla/telemetry-server/tree/master/provisioning/aws 10 | 11 | import argparse 12 | import json 13 | import sys 14 | import traceback 15 | import time 16 | 17 | try: 18 | import boto.ec2 19 | from boto.ec2.blockdevicemapping import BlockDeviceType 20 | from boto.ec2.blockdevicemapping import BlockDeviceMapping 21 | except: 22 | sys.stderr.write("Requires boto; try 'pip install boto'\n") 23 | exit(1) 24 | 25 | default_config = { 26 | "image": "ami-5189a661", 27 | "region": "us-west-2", 28 | "key_name": "20130730-svcops-base-key-dev", 29 | "instance_type": "c3.2xlarge", 30 | "security_groups": ["pipeline-analysis"], 31 | "iam_role": "pipeline-dev-iam-access-IamInstanceProfile-YVZ950U23IFP", 32 | "shutdown": "terminate", 33 | "ephemeral_map": { 34 | "/dev/xvdb": "ephemeral0", 35 | "/dev/xvdc": "ephemeral1" 36 | }, 37 | "owner": "datapipeline", 38 | "tags": { 39 | "App": "pipeline", 40 | "Type": "analysis", 41 | "Env": "dev", 42 | } 43 | } 44 | 45 | 46 | class Launcher(object): 47 | def __init__(self): 48 | parser = self.get_arg_parser() 49 | args = parser.parse_args() 50 | self.read_user_data() 51 | self.setup_config(args) 52 | 53 | def get_arg_parser(self): 54 | parser = argparse.ArgumentParser(description='Launch EC2 instances') 55 | parser.add_argument( 56 | "-c", "--config-file", 57 | help="JSON config file", 58 | type=file, 59 | default=None 60 | ) 61 | parser.add_argument( 62 | "-k", "--aws-key", 63 | help="AWS Key", 64 | default=None 65 | ) 66 | parser.add_argument( 67 | "-s", "--aws-secret-key", 68 | help="AWS Secret Key", 69 | default=None 70 | ) 71 | parser.add_argument( 72 | "-o", "--owner", 73 | help="AWS owner tag", 74 | default=None 75 | ) 76 | return parser 77 | 78 | def read_user_data(self): 79 | with open("userdata.sh", "r") as fh: 80 | self.user_data = fh.read() 81 | 82 | def setup_config(self, args): 83 | self.config = default_config.copy() 84 | if args.config_file: 85 | user_config = json.load(args.config_file) 86 | self.config.update(user_config) 87 | if args.aws_key: 88 | self.config["aws_key"] = args.aws_key 89 | if args.aws_secret_key: 90 | self.config["aws_secret_key"] = args.aws_secret_key 91 | if args.owner: 92 | self.config["owner"] = args.owner 93 | 94 | def fire_up_instance(self): 95 | self.conn = boto.ec2.connect_to_region( 96 | self.config["region"], 97 | aws_access_key_id=self.config.get("aws_key", None), 98 | aws_secret_access_key=self.config.get("aws_secret_key", None) 99 | ) 100 | 101 | mapping = BlockDeviceMapping() 102 | for device, eph_name in self.config["ephemeral_map"].iteritems(): 103 | mapping[device] = BlockDeviceType(ephemeral_name=eph_name) 104 | 105 | reservation = self.conn.run_instances( 106 | self.config["image"], 107 | key_name=self.config["key_name"], 108 | instance_type=self.config["instance_type"], 109 | security_groups=self.config["security_groups"], 110 | block_device_map=mapping, 111 | user_data=self.user_data, 112 | instance_profile_name=self.config["iam_role"], 113 | instance_initiated_shutdown_behavior=self.config["shutdown"] 114 | ) 115 | 116 | instance = reservation.instances[0] 117 | 118 | name_string = "{0}-{1}-{2}".format( 119 | self.config["owner"], 120 | self.config["tags"]["App"], 121 | self.config["tags"]["Type"]) 122 | owner_tags = {"Name": name_string, "Owner": self.config["owner"]} 123 | self.conn.create_tags([instance.id], owner_tags) 124 | self.conn.create_tags([instance.id], self.config["tags"]) 125 | 126 | while instance.state == 'pending': 127 | print "Instance is 
pending -- Waiting 10s for instance", \ 128 | instance.id, "to start up..." 129 | time.sleep(10) 130 | instance.update() 131 | 132 | print ("Instance {0} is {1}".format(instance.id, instance.state)) 133 | print ("ubuntu@{0}".format(instance.public_dns_name)) 134 | 135 | 136 | def main(): 137 | try: 138 | launcher = Launcher() 139 | launcher.fire_up_instance() 140 | return 0 141 | except Exception, e: 142 | print "Error:", e 143 | traceback.print_exc() 144 | return 1 145 | 146 | if __name__ == "__main__": 147 | sys.exit(main()) 148 | -------------------------------------------------------------------------------- /aws/userdata.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #sudo su - 4 | # As of 2015/09/10, build requires geoip >= 1.6.3. 5 | # Add a PPA for recent versions. 6 | add-apt-repository ppa:maxmind/ppa -y 7 | 8 | apt-get update 9 | apt-get --yes install mdadm xfsprogs jq git python-pip python-protobuf cmake libgeoip-dev zlib1g-dev mercurial debhelper libpq-dev libssl-dev 10 | pip install awscli boto 11 | umount /mnt 12 | yes | mdadm --create /dev/md0 --level=0 -c64 --raid-devices=2 /dev/xvdb /dev/xvdc 13 | echo 'DEVICE /dev/xvdb /dev/xvdc' >> /etc/mdadm/mdadm.conf 14 | mdadm --detail --scan >> /etc/mdadm/mdadm.conf 15 | mkfs.xfs /dev/md0 16 | mount /dev/md0 /mnt 17 | mkdir -p /mnt/work 18 | chown -R ubuntu:ubuntu /mnt/work 19 | 20 | cd /mnt/work 21 | wget https://storage.googleapis.com/golang/go1.4.2.linux-amd64.tar.gz 22 | tar -C /usr/local -xzf go1.4.2.linux-amd64.tar.gz 23 | 24 | wget http://people.mozilla.org/~mreid/heka-data-pipeline-linux-amd64.tar.gz 25 | tar xzvf heka-data-pipeline-linux-amd64.tar.gz 26 | 27 | echo "ubuntu hard nofile 200000" >> /etc/security/limits.conf 28 | echo "ubuntu soft nofile 50000" >> /etc/security/limits.conf 29 | -------------------------------------------------------------------------------- /bin/install_dependencies.osx.sh: -------------------------------------------------------------------------------- 1 | # As of 20160421, the latest cmake won't work. Install cmake 3.1 as a workaround. 2 | brew install openssl protobuf postgresql homebrew/versions/cmake31 3 | if [ -z "$(which go)" ]; then 4 | echo "You'll need to install go 1.4.x - see https://golang.org/dl/" 5 | fi 6 | -------------------------------------------------------------------------------- /doc/derived_streams.md: -------------------------------------------------------------------------------- 1 | ## Creating a derived stream 2 | 3 | - Follow the steps in the [`README`](../README.md) to set up a local pipeline. 4 | - Create a Sandbox Filter to extract the information you want. A simple example is the [`payload_size.lua`](../heka/sandbox/filters/payload_size.lua) filter. 5 | - Create a configuration file to test the filter during development. See [`payload_size_devel.toml`](../examples/payload_size_devel.toml) for an example config. For a derived stream based on Telemetry data, you will most likely use a `S3SplitFileInput` to read production data, and a `LogOutput` or `FileOutput` to view the resulting records locally. 6 | - Create a [JSON filter](../examples/payload_size_devel_filter.json) to limit the input data to a reasonable amount for testing. 7 | - Run it: 8 | ```bash 9 | export PATH=$PATH:build/heka/build/heka/bin 10 | hekad -config examples/payload_size_devel.toml 11 | # You should see several "payload_size" messages logged to the console. 
12 | # Check the resulting file output: 13 | heka-cat derived_data.out 14 | ``` 15 | 16 | -------------------------------------------------------------------------------- /examples/basic_local_pipeline.toml: -------------------------------------------------------------------------------- 1 | [hekad] 2 | base_dir = "." 3 | share_dir = "." 4 | # 8MB 5 | max_message_size = 8388608 6 | 7 | [RstEncoder] 8 | 9 | [TestInput] 10 | type = "HttpListenInput" 11 | address = "127.0.0.1:8080" 12 | request_headers = ["Content-Length", "X-Forwarded-For", "DNT", "Date"] 13 | decoder = "HttpEdgeDecoder" 14 | send_decode_failures = true 15 | 16 | [LogOutput] 17 | # Print all incoming http messages (both raw and decoded) 18 | type = "LogOutput" 19 | message_matcher = "Type == 'http_edge_incoming' || Type == 'heka.httpdata.request'" 20 | #message_matcher = "TRUE" 21 | encoder = "RstEncoder" 22 | 23 | [HttpEdgeDecoder] 24 | type = "SandboxDecoder" 25 | filename = "heka/sandbox/decoders/http_edge_decoder.lua" 26 | memory_limit = 90000000 27 | output_limit = 8388608 28 | [HttpEdgeDecoder.config] 29 | geoip_city_db = "GeoLiteCity.dat" 30 | namespace_config = '{"test":{"logger":"test_input","max_path_length":20480,"max_data_length":1048576},"telemetry":{"dimensions":["reason","appName","appVersion","appUpdateChannel","appBuildID"],"max_path_length":10240,"max_data_length":204800}}' 31 | 32 | [DashboardOutput] 33 | address = "localhost:4352" 34 | static_directory = "build/heka/dasher" 35 | ticker_interval = 1 36 | 37 | [PayloadEncoder] 38 | 39 | [RequestRates] 40 | type = "SandboxFilter" 41 | message_matcher = "Type == 'http_edge_incoming'" 42 | filename = "examples/request_rates.lua" 43 | ticker_interval = 10 44 | preserve_data = true 45 | output_limit = 256000 46 | 47 | [ProtobufEncoder] 48 | 49 | [ArchivePipelineOutput] 50 | type = "FileOutput" 51 | path = "./data_decoded.out" 52 | use_framing = true 53 | message_matcher = "Type == 'http_edge_incoming'" 54 | encoder = "ProtobufEncoder" 55 | 56 | [ArchiveRawOutput] 57 | type = "FileOutput" 58 | path = "./data_raw.out" 59 | use_framing = true 60 | message_matcher = "Type == 'heka.httpdata.request'" 61 | encoder = "ProtobufEncoder" 62 | -------------------------------------------------------------------------------- /examples/decode_telemetry.toml: -------------------------------------------------------------------------------- 1 | [hekad] 2 | base_dir = "." 3 | share_dir = "." 
4 | # 8MB 5 | max_message_size = 8388608 6 | 7 | [RstEncoder] 8 | 9 | [TestInput] 10 | type = "HttpListenInput" 11 | address = "127.0.0.1:8080" 12 | request_headers = ["Content-Length", "X-Forwarded-For", "DNT", "Date"] 13 | decoder = "TelemetryDecoders" 14 | send_decode_failures = true 15 | 16 | [LogOutput] 17 | # Print all incoming http messages (both raw and decoded) 18 | type = "LogOutput" 19 | #message_matcher = "Type == 'http_edge_incoming' || Type == 'heka.httpdata.request'" 20 | #message_matcher = "TRUE" 21 | message_matcher = "Logger != 'hekad'" 22 | encoder = "RstEncoder" 23 | 24 | [TelemetryDecoders] 25 | type = "MultiDecoder" 26 | subs = ["HttpEdgeDecoder", "TelemetryDecoder" , "ExecutiveSummary"] 27 | cascade_strategy = "all" 28 | log_sub_errors = true 29 | 30 | [HttpEdgeDecoder] 31 | type = "SandboxDecoder" 32 | filename = "heka/sandbox/decoders/http_edge_decoder.lua" 33 | memory_limit = 90000000 34 | output_limit = 8388608 35 | [HttpEdgeDecoder.config] 36 | geoip_city_db = "GeoLiteCity.dat" 37 | namespace_config = '{"test":{"logger":"test_input","max_path_length":20480,"max_data_length":1048576},"telemetry":{"dimensions":["docType","appName","appVersion","appUpdateChannel","appBuildId"],"max_path_length":10240,"max_data_length":204800},"sslreports":{"max_path_length":1024,"max_data_length":1048576}}' 38 | 39 | [TelemetryDecoder] 40 | type = "SandboxDecoder" 41 | filename = "heka/sandbox/decoders/extract_telemetry_dimensions.lua" 42 | memory_limit = 90000000 43 | output_limit = 2097152 44 | [TelemetryDecoder.config] 45 | duplicate_original = true 46 | 47 | [ExecutiveSummary] 48 | type = "SandboxDecoder" 49 | filename = "heka/sandbox/decoders/extract_executive_summary.lua" 50 | memory_limit = 90000000 51 | output_limit = 2097152 52 | [ExecutiveSummary.config] 53 | duplicate_original = true 54 | 55 | [DashboardOutput] 56 | address = "localhost:4352" 57 | static_directory = "build/heka/dasher" 58 | ticker_interval = 1 59 | 60 | [PayloadEncoder] 61 | [ProtobufEncoder] 62 | 63 | [TelemetryDecodedOutput] 64 | type = "FileOutput" 65 | path = "./data_decoded.out" 66 | use_framing = true 67 | message_matcher = "Logger == 'telemetry' && Type == 'telemetry'" 68 | encoder = "ProtobufEncoder" 69 | 70 | [TelemetryErrorOutput] 71 | type = "FileOutput" 72 | path = "./data_errors.out" 73 | use_framing = true 74 | message_matcher = "Logger == 'telemetry' && Type == 'telemetry.error'" 75 | encoder = "ProtobufEncoder" 76 | 77 | [TelemetryExecutiveSummaryOutput] 78 | type = "FileOutput" 79 | path = "./data_exsum.out" 80 | use_framing = true 81 | message_matcher = "Logger == 'fx' && Type == 'executive_summary'" 82 | encoder = "ProtobufEncoder" 83 | -------------------------------------------------------------------------------- /examples/monitor_dnt.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Monitor DNT header status 7 | 8 | *Example Heka Configuration* 9 | 10 | .. 
code-block:: ini 11 | 12 | [DNTUsage] 13 | type = "SandboxFilter" 14 | filename = "examples/monitor_dnt.lua" 15 | message_matcher = "Type == 'telemetry'" 16 | ticker_interval = 10 17 | preserve_data = true 18 | [DNTUsage.config] 19 | # Increment this if the format changes in a 20 | # backwards-incompatible way 21 | preservation_version = 1 22 | # Number of entries to keep in the circular buffer 23 | rows = 1440 24 | # Length of each bucket in the circular buffer 25 | sec_per_row = 300 26 | 27 | --]] 28 | _PRESERVATION_VERSION = read_config("preservation_version") or 0 29 | 30 | require "circular_buffer" 31 | 32 | -- Default to 2880 minute-long intervals 33 | local rows = read_config("rows") or 2880 34 | local sec_per_row = read_config("sec_per_row") or 60 35 | 36 | -- Create a circular buffer with three columns. It must 37 | -- be a global variable in order for 'preserve_data' to 38 | -- have any effect. 39 | c = circular_buffer.new(rows, 3, sec_per_row, true) 40 | 41 | -- Set the header names for the columns 42 | local ON = c:set_header(1, "DNT On") 43 | local OFF = c:set_header(2, "DNT Off") 44 | local UNK = c:set_header(3, "DNT Unknown") 45 | 46 | function process_message () 47 | local ts = read_message("Timestamp") 48 | local item = read_message("Fields[DNT]") 49 | 50 | if item == "1" then 51 | c:add(ts, ON, 1) 52 | elseif item == "0" then 53 | c:add(ts, OFF, 1) 54 | else 55 | c:add(ts, UNK, 1) 56 | end 57 | 58 | return 0 59 | end 60 | 61 | function timer_event(ns) 62 | -- Inject the entire circular buffer 63 | inject_payload("cbuf", "DNT Status", c:format("cbuf")) 64 | 65 | -- Inject the cbuf delta (changes since last timer event) 66 | inject_payload("cbufd", "DNT Status", c:format("cbufd")) 67 | end 68 | -------------------------------------------------------------------------------- /examples/payload_size_devel.toml: -------------------------------------------------------------------------------- 1 | [hekad] 2 | base_dir = "." 3 | share_dir = "." 4 | # 8MB 5 | max_message_size = 8388608 6 | 7 | # Decode S3 data 8 | [SnappyDecoder] 9 | [Multi] 10 | type = "MultiDecoder" 11 | subs = ["SnappyDecoder", "ProtobufDecoder"] 12 | cascade_strategy = "all" 13 | log_sub_errors = true 14 | 15 | # Read data from S3 16 | [DevInput] 17 | type = "S3SplitFileInput" 18 | s3_bucket = "net-mozaws-prod-us-west-2-pipeline-data" 19 | s3_bucket_prefix = "telemetry-2" 20 | s3_worker_count = 16 21 | s3_read_timeout = 600 22 | schema_file = "examples/payload_size_devel_filter.json" 23 | decoder = "Multi" 24 | 25 | # Generate the "payload_size" derived stream messages. 
26 | [PayloadSize] 27 | type = "SandboxFilter" 28 | filename = "heka/sandbox/filters/payload_size.lua" 29 | message_matcher = "Type == 'telemetry' && Logger == 'telemetry'" 30 | ticker_interval = 0 31 | preserve_data = false 32 | 33 | 34 | # Print both derived-stream messages to the console 35 | [RstEncoder] 36 | [LogOutput] 37 | type = "LogOutput" 38 | message_matcher = "Type == 'heka.sandbox.payload_size'" 39 | encoder = "RstEncoder" 40 | 41 | # Write derived-stream messages to local disk 42 | [ProtobufEncoder] 43 | [ArchiveOutput] 44 | type = "FileOutput" 45 | path = "./derived_data.out" 46 | use_framing = true 47 | message_matcher = "Logger == 'PayloadSize' && Type == 'heka.sandbox.payload_size'" 48 | encoder = "ProtobufEncoder" 49 | -------------------------------------------------------------------------------- /examples/payload_size_devel_filter.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "dimensions": [ 4 | { "field_name": "submissionDate", "allowed_values": "20151001" }, 5 | { "field_name": "sourceName", "allowed_values": "telemetry" }, 6 | { "field_name": "sourceVersion", "allowed_values": "4" }, 7 | { "field_name": "docType", "allowed_values": "main" }, 8 | { "field_name": "appName", "allowed_values": "Firefox" }, 9 | { "field_name": "appUpdateChannel", "allowed_values": "nightly" }, 10 | { "field_name": "appVersion", "allowed_values": "42.0a1" }, 11 | { "field_name": "appBuildId", "allowed_values": "20150629134017" } 12 | ] 13 | } 14 | 15 | -------------------------------------------------------------------------------- /examples/request_rates.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | _PRESERVATION_VERSION = 1 5 | 6 | require "circular_buffer" 7 | 8 | local title = "Requests" 9 | local rows = read_config("rows") or 14400 10 | local sec_per_row = read_config("sec_per_row") or 60 11 | 12 | cbuf = circular_buffer.new(rows, 1, sec_per_row) 13 | cbuf:set_header(1, "Requests") 14 | 15 | function process_message () 16 | cbuf:add(read_message("Timestamp"), 1, 1) 17 | return 0 18 | end 19 | 20 | function timer_event(ns) 21 | inject_payload("cbuf", title, cbuf) 22 | end 23 | -------------------------------------------------------------------------------- /heka/cmd/heka-s3list/main.go: -------------------------------------------------------------------------------- 1 | /***** BEGIN LICENSE BLOCK ***** 2 | # This Source Code Form is subject to the terms of the Mozilla Public 3 | # License, v. 2.0. If a copy of the MPL was not distributed with this file, 4 | # You can obtain one at http://mozilla.org/MPL/2.0/. 5 | # ***** END LICENSE BLOCK *****/ 6 | 7 | /* 8 | 9 | A command-line utility for listing files on Amazon S3, filtered by dimension. 
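Illustrative invocation (a sketch, not from the original docs; the flags are
defined in main() below, and the bucket/schema values are hypothetical). The
schema is loaded with s3splitfile.LoadSchema, so it uses the same dimension-filter
JSON format as examples/payload_size_devel_filter.json:

    heka-s3list -schema filter.json -bucket my-telemetry-bucket -bucket-prefix telemetry-2 -verbose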
10 | 11 | */ 12 | package main 13 | 14 | import ( 15 | "flag" 16 | "fmt" 17 | "github.com/AdRoll/goamz/aws" 18 | "github.com/AdRoll/goamz/s3" 19 | "github.com/mozilla-services/data-pipeline/s3splitfile" 20 | "os" 21 | "time" 22 | ) 23 | 24 | func main() { 25 | flagSchema := flag.String("schema", "", "Filename of the schema to use as a filter") 26 | flagBucket := flag.String("bucket", "default-bucket", "S3 Bucket name") 27 | flagBucketPrefix := flag.String("bucket-prefix", "", "S3 Bucket path prefix") 28 | flagAWSKey := flag.String("aws-key", "", "AWS Key") 29 | flagAWSSecretKey := flag.String("aws-secret-key", "", "AWS Secret Key") 30 | flagAWSRegion := flag.String("aws-region", "us-west-2", "AWS Region") 31 | flagDryRun := flag.Bool("dry-run", false, "Don't actually do anything, just output what would be done") 32 | flagVerbose := flag.Bool("verbose", false, "Print detailed info") 33 | flag.Parse() 34 | 35 | if flag.NArg() != 0 { 36 | flag.PrintDefaults() 37 | os.Exit(1) 38 | } 39 | 40 | var err error 41 | var schema s3splitfile.Schema 42 | schema, err = s3splitfile.LoadSchema(*flagSchema) 43 | if err != nil { 44 | fmt.Printf("schema: %s\n", err) 45 | os.Exit(2) 46 | } 47 | 48 | if *flagDryRun { 49 | fmt.Printf("Dry Run: Would have listed files in s3://%s/%s according to filter schema %s\n", 50 | *flagBucket, *flagBucketPrefix, *flagSchema) 51 | os.Exit(0) 52 | } 53 | 54 | var b *s3.Bucket 55 | 56 | prefix := s3splitfile.CleanBucketPrefix(*flagBucketPrefix) 57 | 58 | // Initialize the S3 bucket 59 | auth, err := aws.GetAuth(*flagAWSKey, *flagAWSSecretKey, "", time.Now()) 60 | if err != nil { 61 | fmt.Printf("Authentication error: %s\n", err) 62 | os.Exit(4) 63 | } 64 | region, ok := aws.Regions[*flagAWSRegion] 65 | if !ok { 66 | fmt.Printf("Parameter 'aws-region' must be a valid AWS Region\n") 67 | os.Exit(5) 68 | } 69 | s := s3.New(auth, region) 70 | b = s.Bucket(*flagBucket) 71 | 72 | var errCount int 73 | var totalCount int 74 | var totalSize int64 75 | 76 | startTime := time.Now().UTC() 77 | 78 | // List the keys as we see them 79 | for k := range s3splitfile.S3Iterator(b, prefix, schema) { 80 | if k.Err != nil { 81 | fmt.Printf("ERROR fetching key: %s\n", k.Err) 82 | errCount++ 83 | } else { 84 | totalCount++ 85 | totalSize += k.Key.Size 86 | fmt.Printf("%s\n", k.Key.Key) 87 | } 88 | } 89 | 90 | duration := time.Now().UTC().Sub(startTime).Seconds() 91 | 92 | if *flagVerbose { 93 | fmt.Printf("Filter matched %d files totaling %s in %.02fs (%d errors)\n", 94 | totalCount, s3splitfile.PrettySize(totalSize), duration, errCount) 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /heka/cmd/s3cat/main.go: -------------------------------------------------------------------------------- 1 | /***** BEGIN LICENSE BLOCK ***** 2 | # This Source Code Form is subject to the terms of the Mozilla Public 3 | # License, v. 2.0. If a copy of the MPL was not distributed with this file, 4 | # You can obtain one at http://mozilla.org/MPL/2.0/. 5 | # ***** END LICENSE BLOCK *****/ 6 | 7 | /* 8 | 9 | A command-line utility for fetching a set of files on Amazon S3 as a single data 10 | stream. 
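Illustrative usage (a sketch, not from the original docs; the flags are defined
in main() below, and the bucket and key names are hypothetical). Keys can be
passed as arguments, or piped in one per line with -stdin, so it composes with
heka-s3list:

    s3cat -bucket my-bucket key/one key/two > combined.out
    heka-s3list -schema filter.json -bucket my-bucket | s3cat -bucket my-bucket -stdin > combined.out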
11 | 12 | */ 13 | package main 14 | 15 | import ( 16 | "bufio" 17 | "flag" 18 | "fmt" 19 | "github.com/AdRoll/goamz/aws" 20 | "github.com/AdRoll/goamz/s3" 21 | "io" 22 | "math" 23 | "os" 24 | "time" 25 | ) 26 | 27 | var bytesRead uint64 28 | 29 | func main() { 30 | flagStdin := flag.Bool("stdin", false, "read list of s3 key names from stdin") 31 | flagBucket := flag.String("bucket", "default-bucket", "S3 Bucket name") 32 | flagAWSKey := flag.String("aws-key", "", "AWS Key") 33 | flagAWSSecretKey := flag.String("aws-secret-key", "", "AWS Secret Key") 34 | flagAWSRegion := flag.String("aws-region", "us-west-2", "AWS Region") 35 | flagConnectTimeout := flag.Uint64("connect_timeout", 60, "Max seconds to wait for an S3 connection") 36 | flagReadTimeout := flag.Uint64("read_timeout", 300, "Max seconds to wait for an S3 file read to complete") 37 | flag.Parse() 38 | 39 | if !*flagStdin && flag.NArg() < 1 { 40 | flag.PrintDefaults() 41 | os.Exit(1) 42 | } 43 | 44 | var connectTimeout uint32 45 | if *flagConnectTimeout < math.MaxUint32 { 46 | connectTimeout = uint32(*flagConnectTimeout) 47 | } else { 48 | fmt.Fprintf(os.Stderr, "Connection Timeout is too large:%d.\n", flagConnectTimeout) 49 | os.Exit(8) 50 | } 51 | 52 | var readTimeout uint32 53 | if *flagReadTimeout < math.MaxUint32 { 54 | readTimeout = uint32(*flagReadTimeout) 55 | } else { 56 | fmt.Fprintf(os.Stderr, "Read Timeout is too large:%d.\n", flagReadTimeout) 57 | os.Exit(8) 58 | } 59 | 60 | auth, err := aws.GetAuth(*flagAWSKey, *flagAWSSecretKey, "", time.Now()) 61 | if err != nil { 62 | fmt.Fprintf(os.Stderr, "Authentication error: %s\n", err) 63 | os.Exit(4) 64 | } 65 | region, ok := aws.Regions[*flagAWSRegion] 66 | if !ok { 67 | fmt.Fprintf(os.Stderr, "Parameter 'aws-region' must be a valid AWS Region\n") 68 | os.Exit(5) 69 | } 70 | s := s3.New(auth, region) 71 | if connectTimeout > 0 { 72 | s.ConnectTimeout = time.Duration(connectTimeout) * time.Second 73 | } 74 | if readTimeout > 0 { 75 | s.ReadTimeout = time.Duration(readTimeout) * time.Second 76 | } 77 | bucket := s.Bucket(*flagBucket) 78 | 79 | startTime := time.Now().UTC() 80 | totalFiles := 0 81 | if *flagStdin { 82 | scanner := bufio.NewScanner(os.Stdin) 83 | for scanner.Scan() { 84 | filename := scanner.Text() 85 | totalFiles++ 86 | cat(bucket, filename) 87 | } 88 | } else { 89 | for _, filename := range flag.Args() { 90 | totalFiles++ 91 | cat(bucket, filename) 92 | } 93 | } 94 | 95 | duration := time.Now().UTC().Sub(startTime).Seconds() 96 | mb := float64(bytesRead) / 1024.0 / 1024.0 97 | if duration == 0.0 { 98 | duration = 1.0 99 | } 100 | fmt.Fprintf(os.Stderr, "All done processing %d files, %.2fMB in %.2f seconds (%.2fMB/s)\n", totalFiles, mb, duration, (mb / duration)) 101 | } 102 | 103 | // Cat the data from a single S3 key 104 | func cat(bucket *s3.Bucket, s3Key string) { 105 | var lastGoodOffset uint64 106 | 107 | RetryS3: 108 | for attempt := 1; attempt <= 5; attempt++ { 109 | rc, err := getS3Reader(bucket, s3Key, lastGoodOffset) 110 | if err != nil && err != io.EOF { 111 | fmt.Fprintf(os.Stderr, "Error in attempt %d reading %s at offset %d: %s\n", attempt, s3Key, lastGoodOffset, err) 112 | continue RetryS3 113 | } else { 114 | nr := bufio.NewReader(rc) 115 | n, err := nr.WriteTo(os.Stdout) 116 | if err != nil && err != io.EOF { 117 | fmt.Fprintf(os.Stderr, "Error in attempt %d writing %s at offset %d: %s\n", attempt, s3Key, lastGoodOffset, err) 118 | rc.Close() 119 | if err.Error() == "write /dev/stdout: broken pipe" { 120 | os.Exit(1) 121 | } 122 | continue 
RetryS3 123 | } 124 | lastGoodOffset += uint64(n) 125 | bytesRead += uint64(n) 126 | } 127 | rc.Close() 128 | break 129 | } 130 | } 131 | 132 | // Callers must call Close() on rc. 133 | func getS3Reader(bucket *s3.Bucket, s3Key string, offset uint64) (rc io.ReadCloser, err error) { 134 | if offset == 0 { 135 | rc, err = bucket.GetReader(s3Key) 136 | return 137 | } 138 | 139 | headers := map[string][]string{ 140 | "Range": []string{fmt.Sprintf("bytes=%d-", offset)}, 141 | } 142 | 143 | resp, err := bucket.GetResponseWithHeaders(s3Key, headers) 144 | 145 | if resp != nil { 146 | rc = resp.Body 147 | } 148 | return 149 | } 150 | -------------------------------------------------------------------------------- /heka/patches/0002-Add-cmdline-tool-for-uploading-to-S3.patch: -------------------------------------------------------------------------------- 1 | From 70654e1d8f917f5e97a8305dd5a632ebe086d252 Mon Sep 17 00:00:00 2001 2 | From: Mark Reid 3 | Date: Mon, 12 Jan 2015 09:25:48 -0400 4 | Subject: [PATCH] Add cmdline tool for uploading to S3. 5 | 6 | --- 7 | CMakeLists.txt | 8 +++ 8 | 1 file changed, 8 insertions(+) 9 | 10 | diff --git a/CMakeLists.txt b/CMakeLists.txt 11 | index 868bf50..602deac 100644 12 | --- a/CMakeLists.txt 13 | +++ b/CMakeLists.txt 14 | @@ -36,6 +36,7 @@ set(SBMGRLOAD_EXE "${PROJECT_PATH}/bin/heka-sbmgrload${CMAKE_EXECUTABLE_SUFFIX}" 15 | set(INJECT_EXE "${PROJECT_PATH}/bin/heka-inject${CMAKE_EXECUTABLE_SUFFIX}") 16 | set(LOGSTREAMER_EXE "${PROJECT_PATH}/bin/heka-logstreamer${CMAKE_EXECUTABLE_SUFFIX}") 17 | set(HEKA_CAT_EXE "${PROJECT_PATH}/bin/heka-cat${CMAKE_EXECUTABLE_SUFFIX}") 18 | +set(HEKA_EXPORT_EXE "${PROJECT_PATH}/bin/heka-export${CMAKE_EXECUTABLE_SUFFIX}") 19 | 20 | option(INCLUDE_SANDBOX "Include Lua sandbox" on) 21 | option(INCLUDE_MOZSVC "Include the Mozilla services plugins" on) 22 | @@ -217,6 +218,13 @@ WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}) 23 | 24 | install(PROGRAMS "${HEKA_CAT_EXE}" DESTINATION bin) 25 | 26 | +add_custom_target(heka-export ALL 27 | +${GO_EXECUTABLE} install ${LDFLAGS} github.com/mozilla-services/heka/cmd/heka-export 28 | +DEPENDS hekad 29 | +WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}) 30 | + 31 | +install(PROGRAMS "${HEKA_EXPORT_EXE}" DESTINATION bin) 32 | + 33 | add_custom_target(sbmgr ALL 34 | ${GO_EXECUTABLE} install ${LDFLAGS} github.com/mozilla-services/heka/cmd/heka-sbmgr 35 | DEPENDS hekad) 36 | -------------------------------------------------------------------------------- /heka/patches/0003-Add-more-cmds.patch: -------------------------------------------------------------------------------- 1 | From b31a2ce9ab6d3f5cf744c8234fd145ae5c14a786 Mon Sep 17 00:00:00 2001 2 | From: Mark Reid 3 | Date: Wed, 4 Feb 2015 17:10:10 -0400 4 | Subject: [PATCH] Update build to include heka-s3list and heka-s3cat 5 | 6 | --- 7 | CMakeLists.txt | 16 ++++++++++++++++ 8 | 1 file changed, 16 insertions(+) 9 | 10 | diff --git a/CMakeLists.txt b/CMakeLists.txt 11 | index a5cdd21..705d223 100644 12 | --- a/CMakeLists.txt 13 | +++ b/CMakeLists.txt 14 | @@ -37,6 +37,9 @@ set(INJECT_EXE "${PROJECT_PATH}/bin/heka-inject${CMAKE_EXECUTABLE_SUFFIX}") 15 | set(LOGSTREAMER_EXE "${PROJECT_PATH}/bin/heka-logstreamer${CMAKE_EXECUTABLE_SUFFIX}") 16 | set(HEKA_CAT_EXE "${PROJECT_PATH}/bin/heka-cat${CMAKE_EXECUTABLE_SUFFIX}") 17 | set(HEKA_EXPORT_EXE "${PROJECT_PATH}/bin/heka-export${CMAKE_EXECUTABLE_SUFFIX}") 18 | +set(HEKA_S3LIST_EXE "${PROJECT_PATH}/bin/heka-s3list${CMAKE_EXECUTABLE_SUFFIX}") 19 | +set(HEKA_S3CAT_EXE 
"${PROJECT_PATH}/bin/heka-s3cat${CMAKE_EXECUTABLE_SUFFIX}") 20 | +set(S3CAT_EXE "${PROJECT_PATH}/bin/s3cat${CMAKE_EXECUTABLE_SUFFIX}") 21 | 22 | option(INCLUDE_SANDBOX "Include Lua sandbox" on) 23 | option(INCLUDE_MOZSVC "Include the Mozilla services plugins" on) 24 | @@ -225,6 +227,27 @@ WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}) 25 | 26 | install(PROGRAMS "${HEKA_EXPORT_EXE}" DESTINATION bin) 27 | 28 | +add_custom_target(heka-s3list ALL 29 | +${GO_EXECUTABLE} install ${LDFLAGS} github.com/mozilla-services/heka/cmd/heka-s3list 30 | +DEPENDS hekad 31 | +WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}) 32 | + 33 | +install(PROGRAMS "${HEKA_S3LIST_EXE}" DESTINATION bin) 34 | + 35 | +add_custom_target(heka-s3cat ALL 36 | +${GO_EXECUTABLE} install ${LDFLAGS} github.com/mozilla-services/heka/cmd/heka-s3cat 37 | +DEPENDS hekad 38 | +WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}) 39 | + 40 | +install(PROGRAMS "${HEKA_S3CAT_EXE}" DESTINATION bin) 41 | + 42 | +add_custom_target(s3cat ALL 43 | +${GO_EXECUTABLE} install ${LDFLAGS} github.com/mozilla-services/heka/cmd/s3cat 44 | +DEPENDS hekad 45 | +WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}) 46 | + 47 | +install(PROGRAMS "${S3CAT_EXE}" DESTINATION bin) 48 | + 49 | add_custom_target(sbmgr ALL 50 | ${GO_EXECUTABLE} install ${LDFLAGS} github.com/mozilla-services/heka/cmd/heka-sbmgr 51 | DEPENDS hekad) 52 | -- 53 | 1.9.4 (Apple Git-50.2) 54 | 55 | -------------------------------------------------------------------------------- /heka/plugins/fx/common.c: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* vim: set ts=2 et sw=2 tw=80: */ 3 | /* This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 | 7 | /** @brief Lua cuckoo filter common functions @file */ 8 | 9 | #include "common.h" 10 | 11 | unsigned clp2(unsigned x) 12 | { 13 | x = x - 1; 14 | x = x | (x >> 1); 15 | x = x | (x >> 2); 16 | x = x | (x >> 4); 17 | x = x | (x >> 8); 18 | x = x | (x >> 16); 19 | return x + 1; 20 | } 21 | 22 | 23 | int nlz(unsigned x) 24 | { 25 | int n; 26 | 27 | if (x == 0) return 32; 28 | n = 1; 29 | if ((x >> 16) == 0) {n = n + 16; x = x << 16;} 30 | if ((x >> 24) == 0) {n = n + 8; x = x << 8;} 31 | if ((x >> 28) == 0) {n = n + 4; x = x << 4;} 32 | if ((x >> 30) == 0) {n = n + 2; x = x << 2;} 33 | n = n - (x >> 31); 34 | return n; 35 | } 36 | 37 | 38 | unsigned fingerprint(unsigned h) 39 | { 40 | h = h >> 16; 41 | return h ? h : 1; 42 | } 43 | -------------------------------------------------------------------------------- /heka/plugins/fx/common.h: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* vim: set ts=2 et sw=2 tw=80: */ 3 | /* This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 | 7 | /** @brief Lua cuckoo filter common functions @file */ 8 | 9 | #ifndef common_h_ 10 | #define common_h_ 11 | 12 | #define BUCKET_SIZE 4 13 | 14 | /** 15 | * Hacker's Delight - Henry S. Warren, Jr. 
page 48 16 | * 17 | * @param x 18 | * 19 | * @return unsigned Least power of 2 greater than or equal to x 20 | */ 21 | unsigned clp2(unsigned x); 22 | 23 | /** 24 | * Hacker's Delight - Henry S. Warren, Jr. page 78 25 | * 26 | * @param x 27 | * 28 | * @return int Number of leading zeros 29 | */ 30 | int nlz(unsigned x); 31 | 32 | /** 33 | * Turn the unsigned value into a 16 bit fingerprint 34 | * 35 | * @param h 36 | * 37 | * @return unsigned 38 | */ 39 | unsigned fingerprint(unsigned h); 40 | #endif 41 | -------------------------------------------------------------------------------- /heka/plugins/hash/lua_hash.c: -------------------------------------------------------------------------------- 1 | /* -*- Mode: C; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 | /* vim: set ts=2 et sw=2 tw=80: */ 3 | /* This Source Code Form is subject to the terms of the Mozilla Public 4 | * License, v. 2.0. If a copy of the MPL was not distributed with this 5 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 | 7 | /** @brief Lua hash functions @file */ 8 | 9 | #include "lauxlib.h" 10 | #include "lua.h" 11 | #include <zlib.h> 12 | 13 | static int zlib_adler32(lua_State* lua) 14 | { 15 | size_t len; 16 | const char* buf; 17 | 18 | if (lua_type(lua, 1) == LUA_TSTRING) { 19 | buf = lua_tolstring(lua, 1, &len); 20 | } else { 21 | return luaL_argerror(lua, 1, "must be a string"); 22 | } 23 | 24 | uLong adler = adler32(0L, Z_NULL, 0); 25 | adler = adler32(adler, buf, len); 26 | lua_pushinteger(lua, adler); 27 | 28 | return 1; 29 | } 30 | 31 | 32 | static int zlib_crc32(lua_State* lua) 33 | { 34 | size_t len; 35 | const char* buf; 36 | 37 | if (lua_type(lua, 1) == LUA_TSTRING) { 38 | buf = lua_tolstring(lua, 1, &len); 39 | } else { 40 | return luaL_argerror(lua, 1, "must be a string"); 41 | } 42 | 43 | uLong crc = crc32(0L, Z_NULL, 0); 44 | crc = crc32(crc, buf, len); 45 | lua_pushinteger(lua, crc); 46 | 47 | return 1; 48 | } 49 | 50 | 51 | static const struct luaL_reg hashlib_f[] = 52 | { 53 | { "adler32", zlib_adler32 } 54 | , { "crc32", zlib_crc32 } 55 | , { NULL, NULL } 56 | }; 57 | 58 | 59 | int luaopen_hash(lua_State* lua) 60 | { 61 | luaL_register(lua, "hash", hashlib_f); 62 | return 1; 63 | } 64 | -------------------------------------------------------------------------------- /heka/plugins/kafkaconsumergroup/kafka_consumer_group_input_test.go: -------------------------------------------------------------------------------- 1 | /***** BEGIN LICENSE BLOCK ***** 2 | # This Source Code Form is subject to the terms of the Mozilla Public 3 | # License, v. 2.0. If a copy of the MPL was not distributed with this file, 4 | # You can obtain one at http://mozilla.org/MPL/2.0/. 5 | # 6 | # The Initial Developer of the Original Code is the Mozilla Foundation. 7 | # Portions created by the Initial Developer are Copyright (C) 2014-2015 8 | # the Initial Developer. All Rights Reserved. 9 | # 10 | # Contributor(s): 11 | # Mike Trinkala (trink@mozilla.com) 12 | # Rob Miller (rmiller@mozilla.com) 13 | # Wesley Dawson (whd@mozilla.com) 14 | # 15 | # ***** END LICENSE BLOCK *****/ 16 | 17 | package kafkaconsumergroup 18 | 19 | import ( 20 | "testing" 21 | 22 | .
"github.com/mozilla-services/heka/pipeline" 23 | ) 24 | 25 | func TestEmptyZookeeperConnectionString(t *testing.T) { 26 | pConfig := NewPipelineConfig(nil) 27 | ki := new(KafkaConsumerGroupInput) 28 | ki.SetPipelineConfig(pConfig) 29 | config := ki.ConfigStruct().(*KafkaConsumerGroupInputConfig) 30 | config.ConsumerGroup = "test" 31 | config.Topics = []string{"test"} 32 | err := ki.Init(config) 33 | 34 | errmsg := "zookeeper_connection_string required" 35 | if err.Error() != errmsg { 36 | t.Errorf("Expected: %s, received: %s", errmsg, err) 37 | } 38 | } 39 | 40 | func TestBadZookeeperConnectionString(t *testing.T) { 41 | pConfig := NewPipelineConfig(nil) 42 | ki := new(KafkaConsumerGroupInput) 43 | ki.SetPipelineConfig(pConfig) 44 | config := ki.ConfigStruct().(*KafkaConsumerGroupInputConfig) 45 | config.ConsumerGroup = "test" 46 | config.Topics = []string{"test"} 47 | config.ZookeeperConnectionString = "::" 48 | err := ki.Init(config) 49 | 50 | errmsg := "too many colons in address ::" 51 | if err.Error() != errmsg { 52 | t.Errorf("Expected: %s, received: %s", errmsg, err) 53 | } 54 | } 55 | 56 | func TestInvalidOffsetMethod(t *testing.T) { 57 | pConfig := NewPipelineConfig(nil) 58 | ki := new(KafkaConsumerGroupInput) 59 | ki.SetName("test") 60 | ki.SetPipelineConfig(pConfig) 61 | 62 | config := ki.ConfigStruct().(*KafkaConsumerGroupInputConfig) 63 | config.ConsumerGroup = "test" 64 | config.Topics = []string{"test"} 65 | config.ZookeeperConnectionString = "localhost:2181" 66 | config.OffsetMethod = "last" 67 | err := ki.Init(config) 68 | 69 | errmsg := "invalid offset_method: last" 70 | if err.Error() != errmsg { 71 | t.Errorf("Expected: %s, received: %s", errmsg, err) 72 | } 73 | } 74 | 75 | func TestEmptyInputTopics(t *testing.T) { 76 | pConfig := NewPipelineConfig(nil) 77 | ki := new(KafkaConsumerGroupInput) 78 | ki.SetPipelineConfig(pConfig) 79 | config := ki.ConfigStruct().(*KafkaConsumerGroupInputConfig) 80 | config.ConsumerGroup = "test" 81 | config.ZookeeperConnectionString = "localhost:2181" 82 | err := ki.Init(config) 83 | 84 | errmsg := "topics required" 85 | if err.Error() != errmsg { 86 | t.Errorf("Expected: %s, received: %s", errmsg, err) 87 | } 88 | } 89 | 90 | func TestMissingConsumerGroup(t *testing.T) { 91 | pConfig := NewPipelineConfig(nil) 92 | ki := new(KafkaConsumerGroupInput) 93 | ki.SetPipelineConfig(pConfig) 94 | config := ki.ConfigStruct().(*KafkaConsumerGroupInputConfig) 95 | config.Topics = []string{"test"} 96 | config.ZookeeperConnectionString = "localhost:2181" 97 | err := ki.Init(config) 98 | 99 | errmsg := "consumer_group required" 100 | if err.Error() != errmsg { 101 | t.Errorf("Expected: %s, received: %s", errmsg, err) 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /heka/plugins/s3splitfile/all_specs_test.go: -------------------------------------------------------------------------------- 1 | /***** BEGIN LICENSE BLOCK ***** 2 | # This Source Code Form is subject to the terms of the Mozilla Public 3 | # License, v. 2.0. If a copy of the MPL was not distributed with this file, 4 | # You can obtain one at http://mozilla.org/MPL/2.0/. 
5 | # ***** END LICENSE BLOCK *****/ 6 | 7 | package s3splitfile 8 | 9 | import ( 10 | "github.com/rafrombrc/gospec/src/gospec" 11 | "testing" 12 | ) 13 | 14 | func TestAllSpecs(t *testing.T) { 15 | r := gospec.NewRunner() 16 | r.Parallel = false 17 | 18 | r.AddSpec(S3SplitFileSpec) 19 | 20 | gospec.MainGoTest(r, t) 21 | } 22 | -------------------------------------------------------------------------------- /heka/plugins/s3splitfile/s3splitfile_common_test.go: -------------------------------------------------------------------------------- 1 | /***** BEGIN LICENSE BLOCK ***** 2 | # This Source Code Form is subject to the terms of the Mozilla Public 3 | # License, v. 2.0. If a copy of the MPL was not distributed with this file, 4 | # You can obtain one at http://mozilla.org/MPL/2.0/. 5 | # ***** END LICENSE BLOCK *****/ 6 | 7 | package s3splitfile 8 | 9 | import ( 10 | "github.com/mozilla-services/heka/message" 11 | . "github.com/mozilla-services/heka/pipeline" 12 | gs "github.com/rafrombrc/gospec/src/gospec" 13 | "path/filepath" 14 | ) 15 | 16 | func testFieldVal(c gs.Context, schema Schema, field string, actual string, expected string) { 17 | sVal, err := schema.GetValue(field, actual) 18 | c.Expect(err, gs.IsNil) 19 | c.Expect(sVal, gs.Equals, expected) 20 | } 21 | 22 | func S3SplitFileSpec(c gs.Context) { 23 | c.Specify("Sanitize dimensions", func() { 24 | c.Expect("hello_there", gs.Equals, SanitizeDimension("hello!there")) 25 | 26 | c.Expect("___________________________", gs.Equals, SanitizeDimension("!@#$%^&*(){}[]|+=-`~'\",<>?\x02")) 27 | }) 28 | 29 | c.Specify("JSON Schema", func() { 30 | schema, err := LoadSchema(filepath.Join(".", "testsupport", "schema.json")) 31 | c.Expect(err, gs.IsNil) 32 | 33 | c.Expect(len(schema.Fields), gs.Equals, 5) 34 | 35 | // Bogus field: 36 | _, err = schema.GetValue("bogus", "some value") 37 | c.Expect(err, gs.Not(gs.IsNil)) 38 | 39 | testFieldVal(c, schema, "any", "foo", "foo") 40 | testFieldVal(c, schema, "any", "Any value at all is acceptable!", "Any value at all is acceptable!") 41 | 42 | testFieldVal(c, schema, "list", "foo", "foo") 43 | testFieldVal(c, schema, "list", "bar", "bar") 44 | testFieldVal(c, schema, "list", "baz", "baz") 45 | testFieldVal(c, schema, "list", "quux", "OTHER") 46 | testFieldVal(c, schema, "list", "Some values are not acceptable!", "OTHER") 47 | 48 | testFieldVal(c, schema, "rangeMin", "aaa", "aaa") 49 | testFieldVal(c, schema, "rangeMin", "foo", "foo") 50 | testFieldVal(c, schema, "rangeMin", "bar", "bar") 51 | testFieldVal(c, schema, "rangeMin", "all values larger than 'aaa' are fine!", "all values larger than 'aaa' are fine!") 52 | testFieldVal(c, schema, "rangeMin", "100", "OTHER") 53 | 54 | testFieldVal(c, schema, "rangeMax", "all", "all") 55 | testFieldVal(c, schema, "rangeMax", "bar", "bar") 56 | testFieldVal(c, schema, "rangeMax", "bbb", "bbb") 57 | testFieldVal(c, schema, "rangeMax", "all values smaller than 'bbb' are fine!", "all values smaller than 'bbb' are fine!") 58 | testFieldVal(c, schema, "rangeMax", "100", "100") 59 | testFieldVal(c, schema, "rangeMax", "ccc", "OTHER") 60 | 61 | testFieldVal(c, schema, "range", "aaa", "aaa") 62 | testFieldVal(c, schema, "range", "all", "all") 63 | testFieldVal(c, schema, "range", "bar", "bar") 64 | testFieldVal(c, schema, "range", "bbb", "bbb") 65 | testFieldVal(c, schema, "range", "all values between 'aaa' and 'bbb' are fine!", "all values between 'aaa' and 'bbb' are fine!") 66 | testFieldVal(c, schema, "range", "100", "OTHER") 67 | testFieldVal(c, schema, "range", 
"aa0", "OTHER") 68 | testFieldVal(c, schema, "range", "bbc", "OTHER") 69 | testFieldVal(c, schema, "range", "ccc", "OTHER") 70 | }) 71 | 72 | c.Specify("Non-string fields", func() { 73 | schema, _ := LoadSchema(filepath.Join(".", "testsupport", "schema.json")) 74 | pack := NewPipelinePack(nil) 75 | 76 | // No fields 77 | dims := schema.GetDimensions(pack) 78 | c.Expect(dims[0], gs.Equals, "UNKNOWN") 79 | 80 | // Integer field 81 | f, _ := message.NewField("any", 1, "") 82 | pack.Message.AddField(f) 83 | dims = schema.GetDimensions(pack) 84 | c.Expect(dims[0], gs.Equals, "1") 85 | pack.Message.DeleteField(f) 86 | 87 | // Boolean field 88 | f, _ = message.NewField("any", true, "") 89 | pack.Message.AddField(f) 90 | dims = schema.GetDimensions(pack) 91 | c.Expect(dims[0], gs.Equals, "true") 92 | pack.Message.DeleteField(f) 93 | 94 | // Double field 95 | f, _ = message.NewField("any", 1.23, "") 96 | pack.Message.AddField(f) 97 | dims = schema.GetDimensions(pack) 98 | c.Expect(dims[0], gs.Equals, "1.23") 99 | pack.Message.DeleteField(f) 100 | 101 | // Empty string field 102 | f, _ = message.NewField("any", "", "") 103 | pack.Message.AddField(f) 104 | dims = schema.GetDimensions(pack) 105 | c.Expect(dims[0], gs.Equals, "UNKNOWN") 106 | pack.Message.DeleteField(f) 107 | 108 | }) 109 | } 110 | -------------------------------------------------------------------------------- /heka/plugins/s3splitfile/testsupport/schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "dimensions": [ 4 | { "field_name": "any", "allowed_values": "*" }, 5 | { "field_name": "list", "allowed_values": ["foo", "bar", "baz"] }, 6 | { "field_name": "rangeMin", "allowed_values": { "min": "aaa" } }, 7 | { "field_name": "rangeMax", "allowed_values": { "max": "bbb" } }, 8 | { "field_name": "range", "allowed_values": { "min": "aaa", "max": "bbb" } } 9 | ] 10 | } 11 | 12 | -------------------------------------------------------------------------------- /heka/plugins/snap/snappy_decoder.go: -------------------------------------------------------------------------------- 1 | /***** BEGIN LICENSE BLOCK ***** 2 | # This Source Code Form is subject to the terms of the Mozilla Public 3 | # License, v. 2.0. If a copy of the MPL was not distributed with this file, 4 | # You can obtain one at http://mozilla.org/MPL/2.0/. 5 | # ***** END LICENSE BLOCK *****/ 6 | 7 | package snap 8 | 9 | import ( 10 | "github.com/golang/snappy" 11 | . "github.com/mozilla-services/heka/pipeline" 12 | ) 13 | 14 | // SnappyDecoder decompresses snappy-compressed Message bytes. 15 | type SnappyDecoder struct { 16 | } 17 | 18 | func (re *SnappyDecoder) Init(config interface{}) (err error) { 19 | return 20 | } 21 | 22 | func (re *SnappyDecoder) Decode(pack *PipelinePack) (packs []*PipelinePack, err error) { 23 | output, decodeErr := snappy.Decode(nil, pack.MsgBytes) 24 | 25 | packs = []*PipelinePack{pack} 26 | if decodeErr == nil { 27 | // Replace bytes with decoded data 28 | pack.MsgBytes = output 29 | } 30 | // If there is an error decoding snappy, maybe it wasn't compressed. We'll 31 | // return the original data and try to proceed. 
32 | return 33 | } 34 | 35 | func init() { 36 | RegisterPlugin("SnappyDecoder", func() interface{} { 37 | return new(SnappyDecoder) 38 | }) 39 | } 40 | -------------------------------------------------------------------------------- /heka/plugins/snap/snappy_encoder.go: -------------------------------------------------------------------------------- 1 | /***** BEGIN LICENSE BLOCK ***** 2 | # This Source Code Form is subject to the terms of the Mozilla Public 3 | # License, v. 2.0. If a copy of the MPL was not distributed with this file, 4 | # You can obtain one at http://mozilla.org/MPL/2.0/. 5 | # ***** END LICENSE BLOCK *****/ 6 | 7 | package snap 8 | 9 | import ( 10 | "github.com/golang/snappy" 11 | . "github.com/mozilla-services/heka/pipeline" 12 | ) 13 | 14 | // SnappyEncoder compresses the Message bytes using snappy compression. Each 15 | // message is compressed separately. 16 | type SnappyEncoder struct { 17 | } 18 | 19 | func (re *SnappyEncoder) Init(config interface{}) (err error) { 20 | return 21 | } 22 | 23 | func (re *SnappyEncoder) Encode(pack *PipelinePack) (output []byte, err error) { 24 | output = snappy.Encode(nil, pack.MsgBytes) 25 | return output, nil 26 | } 27 | 28 | func init() { 29 | RegisterPlugin("SnappyEncoder", func() interface{} { 30 | return new(SnappyEncoder) 31 | }) 32 | } 33 | -------------------------------------------------------------------------------- /heka/sandbox/decoders/decompress_payload.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | require "string" 6 | local gzip = require "gzip" 7 | 8 | function process_message() 9 | local payload = read_message("Payload") 10 | local b1, b2 = string.byte(payload, 1, 2) 11 | 12 | if b1 == 0x1f and b2 == 0x8b then -- test for gzip magic header bytes 13 | local ok, result = pcall(gzip.decompress, payload) 14 | if not ok then 15 | return -1, result 16 | end 17 | write_message("Payload", result) 18 | end 19 | 20 | return 0 21 | end 22 | -------------------------------------------------------------------------------- /heka/sandbox/decoders/extract_fhr_dimensions.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | The decoder extracts the FHR partition dimensions from the large JSON payload 7 | and adds them as message fields to avoid additional down stream parsing; it also 8 | uses an IP address lookup to determine the submission's country of origin and 9 | adds it a as a message field. 10 | 11 | Config: 12 | 13 | - geoip_city_db (string) 14 | The fully qualified path to the GeoIP city database (if not in the default 15 | location). 16 | 17 | *Example Heka Configuration* 18 | 19 | .. 
code-block:: ini 20 | 21 | [FHRDecoder] 22 | type = "SandboxDecoder" 23 | filename = "extract_fhr_dimensions.lua" 24 | memory_limit = 30000000 25 | output_limit = 2097152 26 | 27 | # Default 28 | # [FHRDecoder.config] 29 | # geoip_city_db = "/usr/local/share/GeoIP/GeoIPCity.dat" 30 | 31 | *Example Heka Message* 32 | 33 | :Timestamp: 2014-07-19 17:23:35.060999936 +0000 UTC 34 | :Type: fhr_metadata 35 | :Hostname: ip-10-227-137-43 36 | :Pid: 0 37 | :Uuid: 2dfcbeb8-18d4-41b8-af50-aa055fd94831 38 | :Logger: fhr 39 | :Payload: {...} 40 | :EnvVersion: 41 | :Severity: 7 42 | :Fields: 43 | | name:"submissionDate" type:string value:"20140719" 44 | | name:"appVersion" type:string value:"30.0" 45 | | name:"appUpdateChannel" type:string value:"release" 46 | | name:"sourceVersion" type:string value:"2" 47 | | name:"clientID" type:string value:"a6d35999-2d8d-4c68-9c6b-fbe8c514e40e" 48 | | name:"os" type:string value:"Linux" 49 | | name:"geoCountry" type:string value:"GB" 50 | | name:"sourceName" type:string value:"fhr" 51 | | name:"vendor" type:string value:"Mozilla" 52 | | name:"appBuildID" type:string value:"20140608211622" 53 | | name:"appName" type:string value:"Firefox" 54 | --]] 55 | 56 | require "cjson" 57 | require 'geoip.city' 58 | require "os" 59 | 60 | local city_db = assert(geoip.city.open(read_config("geoip_city_db"))) 61 | 62 | local msg = { 63 | Timestamp = nil, 64 | Type = "fhr_metadata", 65 | Payload = nil, 66 | Fields = { sourceName = "fhr" } 67 | } 68 | 69 | local UNK_DIM = "UNKNOWN" 70 | local UNK_GEO = "??" 71 | 72 | function process_message() 73 | -- Carry forward payload 74 | msg.Payload = read_message("Payload") 75 | 76 | local ok, fhr = pcall(cjson.decode, msg.Payload) 77 | if not ok then return -1, fhr end 78 | 79 | msg.Fields.sourceVersion = tostring(fhr.version) or UNK_DIM 80 | 81 | local info 82 | if msg.Fields.sourceVersion == "1" then 83 | if type(fhr.data) ~= "table" then 84 | return -1, "missing object: data" 85 | end 86 | if type(fhr.data.last) ~= "table" then 87 | return -1, "missing object: data.last" 88 | end 89 | if type(fhr.data.last["org.mozilla.appInfo.appinfo"]) == "table" then 90 | info = fhr.data.last["org.mozilla.appInfo.appinfo"] 91 | elseif type(fhr.data.last["org.mozilla.appInfo.appinfo.1"]) == "table" then 92 | info = fhr.data.last["org.mozilla.appInfo.appinfo.1"] 93 | else 94 | return -1, "missing object: data.last[org.mozilla.appInfo.appinfo]" 95 | end 96 | elseif msg.Fields.sourceVersion == "2" then 97 | if type(fhr.geckoAppInfo) ~= "table" then 98 | return -1, "missing object: geckoAppInfo" 99 | end 100 | info = fhr.geckoAppInfo 101 | elseif msg.Fields.sourceVersion == "3" then 102 | -- Use v3 structure. 103 | if type(fhr.environments) ~= "table" then 104 | return -1, "missing object: environments" 105 | end 106 | if type(fhr.environments.current) ~= "table" then 107 | return -1, "missing object: environments.current" 108 | end 109 | if type(fhr.environments.current.geckoAppInfo) ~= "table" then 110 | return -1, "missing object: environments.current.geckoAppInfo" 111 | end 112 | info = fhr.environments.current.geckoAppInfo 113 | else 114 | return -1, "unknown payload version" 115 | end 116 | 117 | -- Get some more dimensions 118 | msg.Fields.appName = info.name or UNK_DIM 119 | msg.Fields.appVersion = info.version or UNK_DIM 120 | msg.Fields.appUpdateChannel = info.updateChannel or UNK_DIM 121 | 122 | -- Do not want default values for these. 
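-- (Left nil on purpose: if the payload does not supply one of these values the
-- assignment is a no-op, so the field is simply omitted from the injected
-- message rather than being filled with the UNK_DIM placeholder used above.)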
123 | msg.Fields.appBuildID = info.appBuildID 124 | msg.Fields.os = info.os 125 | msg.Fields.vendor = info.vendor 126 | msg.Fields.clientID = fhr.clientID 127 | 128 | -- IP address lookup 129 | msg.Fields.geoCountry = city_db:query_by_addr(read_message("Fields[remote_addr]"), "country_code") or UNK_GEO 130 | 131 | -- Carry forward timestamp. 132 | msg.Timestamp = read_message("Timestamp") 133 | 134 | msg.Fields.submissionDate = os.date("%Y%m%d", msg.Timestamp / 1e9) 135 | 136 | -- Send new message along 137 | inject_message(msg) 138 | 139 | return 0 140 | end 141 | -------------------------------------------------------------------------------- /heka/sandbox/decoders/extract_tls_info.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Extract clock skew, issuer info and Subject / SAN match status from tls error 7 | reports. This decoder MUST NOT return failure due to the way the Heka 8 | MultiDecoder is implemented. 9 | --]] 10 | 11 | require "string" 12 | require "cjson" 13 | require "os" 14 | 15 | local openssl = require "openssl" 16 | local name = openssl.x509.name 17 | local asn1 = openssl.asn1 18 | 19 | local certPrefix = "-----BEGIN CERTIFICATE-----\n" 20 | local certSuffix = "-----END CERTIFICATE-----\n" 21 | 22 | local msg = { 23 | Type = "tls_report", 24 | Fields = {} 25 | } 26 | 27 | -- create PEM data from base64 encoded DER 28 | local function make_pem(data) 29 | local pem = certPrefix 30 | local offset = 1 31 | while offset <= data:len() do 32 | local stop = offset + 63 33 | if stop > data:len() then 34 | stop = data:len() 35 | end 36 | pem = pem .. data:sub(offset, stop) .. "\n" 37 | offset = stop + 1 38 | end 39 | return pem .. 
certSuffix 40 | end 41 | 42 | -- read and parse a certificate 43 | local function read_cert(data) 44 | local pem = make_pem(data) 45 | return pcall(openssl.x509.read, pem) 46 | end 47 | 48 | local function parse_cert(cert) 49 | return pcall(cert.parse, cert) 50 | end 51 | 52 | local duplicate_original = read_config("duplicate_original") 53 | 54 | function process_message() 55 | if duplicate_original then 56 | inject_message(read_message("raw")) 57 | end 58 | 59 | msg.Fields["submissionDate"] = read_message("Fields[submissionDate]") 60 | 61 | local payload = read_message("Fields[submission]") 62 | local ok, report = pcall(cjson.decode, payload) 63 | if not ok then return -1, report end 64 | 65 | -- copy over the expected fields 66 | local expected = { 67 | "hostname", 68 | "port", 69 | "timestamp", 70 | "errorCode", 71 | "failedCertChain", 72 | "userAgent", 73 | "version", 74 | "build", 75 | "product", 76 | "channel" 77 | } 78 | 79 | for i, fieldname in ipairs(expected) do 80 | local field = report[fieldname] 81 | -- ensure the field is not empty (and does not contain an empty table) 82 | if not ("table" == type(field) and next(field) == nil) then 83 | msg.Fields[fieldname] = field 84 | end 85 | end 86 | 87 | -- calculate the clock skew - in seconds, since os.time() returns those 88 | local reportTime = report["timestamp"] 89 | if "number" == type(reportTime) then 90 | -- skew will be positive if the remote timestamp is in the future 91 | local skew = reportTime - os.time() 92 | 93 | msg.Fields["skew"] = skew 94 | end 95 | 96 | -- extract the rootmost and end entity certificates 97 | local failedCertChain = report["failedCertChain"] 98 | local ee = nil 99 | local rootMost = nil 100 | if "table" == type(failedCertChain) then 101 | for i, cert in ipairs(failedCertChain) do 102 | if not ee then 103 | ee = cert 104 | end 105 | rootMost = cert 106 | end 107 | end 108 | 109 | -- get the issuer name from the root-most certificate 110 | if rootMost then 111 | local parsed = nil 112 | local ok, cert = read_cert(rootMost); 113 | if ok and cert then 114 | ok, parsed = parse_cert(cert) 115 | end 116 | if ok and parsed then 117 | local issuer = parsed["issuer"] 118 | if issuer then 119 | msg.Fields["rootIssuer"] = issuer:get_text("CN") 120 | end 121 | end 122 | end 123 | 124 | -- determine if the end entity subject or SAN matches the hostname 125 | local hostname = report["hostname"] 126 | if ee and hostname then 127 | local ok, cert = read_cert(ee); 128 | if ok and cert then 129 | local ok, matches = pcall(cert.check_host, cert, hostname) 130 | if ok and matches then 131 | msg.Fields["hostnameMatch"] = matches 132 | end 133 | end 134 | end 135 | 136 | inject_message(msg) 137 | return 0 138 | end 139 | -------------------------------------------------------------------------------- /heka/sandbox/encoders/combine_telemetry_objects.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | require "cjson" 6 | local l = require "lpeg" 7 | 8 | local grammar = (l.C"payload" + l.C"environment") * l.P"." 
* l.C(l.P(1)^1) 9 | 10 | function process_message() 11 | local raw = read_message("raw") 12 | local ok, msg = pcall(decode_message, raw) 13 | if not ok then return -1, msg end 14 | 15 | if type(msg.Fields) ~= "table" then return -1, "missing Fields" end 16 | 17 | local meta = { 18 | Timestamp = msg.Timestamp / 1e9, 19 | Type = msg.Type, 20 | Hostname = msg.Hostname, 21 | } 22 | 23 | local ok, json = pcall(cjson.decode, read_message("Payload")) 24 | if not ok then return -1, json end 25 | 26 | for i=1, #msg.Fields do 27 | local section, name = grammar:match(msg.Fields[i].name) 28 | if section then 29 | local ok, object = pcall(cjson.decode, msg.Fields[i].value[1]) 30 | if ok then 31 | json[section][name] = object 32 | end 33 | else 34 | meta[msg.Fields[i].name] = msg.Fields[i].value[1] 35 | end 36 | end 37 | 38 | local ok, jmeta = pcall(cjson.encode, meta) 39 | if not ok then return -1, jmeta end 40 | local ok, payload = pcall(cjson.encode, json) 41 | if not ok then return -1, payload end 42 | 43 | inject_payload("txt", "output", json.clientId, "\t[", jmeta, ",", payload, "]\n") 44 | return 0 45 | end 46 | -------------------------------------------------------------------------------- /heka/sandbox/filters/count_by_normalized_channel.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Request Counts by Normalized Channel 7 | 8 | *Example Heka Configuration* 9 | 10 | .. code-block:: ini 11 | 12 | [CountByNormalizedChannel] 13 | type = "SandboxFilter" 14 | filename = "lua_filters/count_by_normalized_channel.lua" 15 | message_matcher = "Logger == 'fx' && Type == 'executive_summary' && Fields[vendor] == 'Mozilla' && Fields[app] == 'Firefox'" 16 | ticker_interval = 60 17 | preserve_data = true 18 | 19 | --]] 20 | 21 | require "circular_buffer" 22 | fx = require "fx" 23 | 24 | local rows = read_config("rows") or 1440 25 | local sec_per_row = read_config("sec_per_row") or 60 26 | 27 | local nchannels = fx.get_channel_count() 28 | local channel_counter = circular_buffer.new(rows, nchannels, sec_per_row, true) 29 | for i=1,nchannels do 30 | -- Circular buffer columns are one-based, channel ids are zero-based. 31 | channel_counter:set_header(i, fx.get_channel_name(i - 1)) 32 | end 33 | 34 | function process_message() 35 | local ts = read_message("Timestamp") 36 | local normalized = fx.normalize_channel(read_message("Fields[channel]")) 37 | 38 | -- Need to add one to account for "Other" (which comes back as zero) 39 | local column_id = fx.get_channel_id(normalized) + 1 40 | channel_counter:add(ts, column_id, 1) 41 | return 0 42 | end 43 | 44 | local title = "Counts by Normalized Channel" 45 | function timer_event(ns) 46 | inject_payload("cbuf", title, channel_counter:format("cbuf")) 47 | inject_payload("cbufd", title, channel_counter:format("cbufd")) 48 | end 49 | -------------------------------------------------------------------------------- /heka/sandbox/filters/fhr_requests.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 
4 | 5 | --[[ 6 | FHR Request Counts 7 | 8 | *Example Heka Configuration* 9 | 10 | .. code-block:: ini 11 | 12 | [FHRRequestCount] 13 | type = "SandboxFilter" 14 | filename = "lua_filters/fhr_requests.lua" 15 | message_matcher = "Logger == 'fx' && Type == 'executive_summary'" 16 | ticker_interval = 60 17 | preserve_data = true 18 | 19 | --]] 20 | _PRESERVATION_VERSION = 1 21 | 22 | require "circular_buffer" 23 | local alert = require "alert" 24 | local annotation = require "annotation" 25 | local anomaly = require "anomaly" 26 | 27 | local title = "FHR Requests" 28 | local rows = read_config("rows") or 14400 29 | local sec_per_row = read_config("sec_per_row") or 60 30 | local anomaly_config = anomaly.parse_config(read_config("anomaly_config")) 31 | annotation.set_prune(title, rows * sec_per_row * 1e9) 32 | 33 | cbuf = circular_buffer.new(rows, 1, sec_per_row, true) 34 | cbuf:set_header(1, "Requests") 35 | 36 | function process_message () 37 | cbuf:add(read_message("Timestamp"), 1, 1) 38 | return 0 39 | end 40 | 41 | function timer_event(ns) 42 | if anomaly_config then 43 | if not alert.throttled(ns) then 44 | local msg, annos = anomaly.detect(ns, title, cbuf, anomaly_config) 45 | if msg then 46 | annotation.concat(title, annos) 47 | alert.send(ns, msg) 48 | end 49 | end 50 | inject_payload("cbuf", title, annotation.prune(title, ns), cbuf:format("cbuf")) 51 | else 52 | inject_payload("cbuf", title, cbuf:format("cbuf")) 53 | end 54 | inject_payload("cbufd", title, cbuf:format("cbufd")) 55 | end 56 | -------------------------------------------------------------------------------- /heka/sandbox/filters/firefox_active_instances.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Firefox Active Instances 7 | 8 | *Example Heka Configuration* 9 | 10 | .. 
code-block:: ini 11 | 12 | [FirefoxActiveInstances] 13 | type = "SandboxFilter" 14 | filename = "lua_filters/firefox_active_instances.lua" 15 | message_matcher = "Logger == 'fx' && Type == 'executive_summary' && Fields[vendor] == 'Mozilla' && Fields[app] == 'Firefox'" 16 | ticker_interval = 60 17 | preserve_data = true 18 | --]] 19 | require "circular_buffer" 20 | require "cjson" 21 | require "math" 22 | require "os" 23 | require "hyperloglog" 24 | 25 | local DAYS = 30 26 | local SEC_IN_DAY = 60 * 60 * 24 27 | local floor = math.floor 28 | local date = os.date 29 | 30 | day_cb = circular_buffer.new(DAYS, 1, SEC_IN_DAY, true) 31 | day_cb:set_header(1, "Active Instances") 32 | day_hll = {} 33 | for i=1,DAYS do 34 | day_hll[i] = hyperloglog.new() 35 | end 36 | current_day = -1 37 | 38 | local month_names = {"Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", 39 | "Sep", "Oct", "Nov", "Dec"} 40 | local MONTHS = #month_names 41 | month_hll = {} 42 | for i=1,MONTHS do 43 | month_hll[i] = hyperloglog.new() 44 | end 45 | current_month = -1 46 | 47 | local function clear_days(s, e) 48 | for i = s + 1, e do 49 | local idx = i % DAYS + 1 50 | day_hll[idx]:clear() 51 | day_cb:set(i * SEC_IN_DAY * 1e9, 1, 0) 52 | end 53 | end 54 | 55 | local function update_day(ts, cid, day) 56 | if current_day == -1 then current_day = day end 57 | 58 | local delta = day - current_day 59 | if delta > 0 and delta < DAYS then 60 | clear_days(current_day, day) 61 | current_day = day 62 | elseif delta >= DAYS then 63 | clear_days(current_day, current_day + DAYS) 64 | current_day = current_day + delta 65 | elseif delta <= -DAYS then 66 | return -- ignore data in the past 67 | end 68 | local idx = day % DAYS + 1 69 | if day_hll[idx]:add(cid) then 70 | day_cb:set(ts, 1, day_hll[idx]:count()) 71 | end 72 | end 73 | 74 | local function clear_months(s, n) 75 | for i = 1, n do 76 | s = s + 1 77 | if s > MONTHS then s = 1 end 78 | month_hll[s]:clear() 79 | end 80 | end 81 | 82 | local function update_month(ts, cid, day_changed, day_advanced) 83 | local month = current_month 84 | if current_month == -1 or day_changed then 85 | local t = date("*t", ts / 1e9) 86 | month = tonumber(t.month) 87 | if current_month == -1 then current_month = month end 88 | end 89 | 90 | if day_advanced then 91 | local delta = month - current_month 92 | if delta > 0 then 93 | clear_months(current_month, delta) 94 | current_month = month 95 | elseif delta < 0 then -- roll over the year 96 | clear_months(current_month, MONTHS + delta) 97 | current_month = month 98 | end 99 | end 100 | 101 | month_hll[month]:add(cid) 102 | end 103 | 104 | ---- 105 | 106 | function process_message() 107 | local ts = read_message("Timestamp") 108 | local cid = read_message("Fields[clientId]") 109 | if type(cid) == "string" then 110 | local day = floor(ts / (SEC_IN_DAY * 1e9)) 111 | local day_changed = day ~= current_day 112 | local day_advanced = day > current_day 113 | update_day(ts, cid, day) 114 | update_month(ts, cid, day_changed, day_advanced) 115 | end 116 | return 0 117 | end 118 | 119 | local title = "Firefox Active Daily Instances" 120 | function timer_event(ns) 121 | inject_payload("cbuf", title, day_cb:format("cbuf")) 122 | inject_payload("cbufd", title, day_cb:format("cbufd")) 123 | 124 | local json = {} 125 | local idx = current_month 126 | if idx == -1 then idx = 0 end 127 | 128 | for i=1,MONTHS do 129 | idx = idx + 1 130 | if idx > MONTHS then idx = 1 end 131 | json[i] = {[month_names[idx]] = month_hll[idx]:count()} 132 | end 133 | inject_payload("json", 
"Firefox Active Monthly Instances", cjson.encode(json)) 134 | end 135 | -------------------------------------------------------------------------------- /heka/sandbox/filters/firefox_channel_switching.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Firefox Channel Switching 7 | 8 | *Example Heka Configuration* 9 | 10 | .. code-block:: ini 11 | 12 | [FirefoxChannelSwitching] 13 | type = "SandboxFilter" 14 | filename = "lua_filters/firefox_channel_switching.lua" 15 | message_matcher = "Logger == 'fx' && Type == 'executive_summary' && Fields[vendor] == 'Mozilla' && Fields[app] == 'Firefox'" 16 | memory_limit = 1000000000 17 | ticker_interval = 60 18 | preserve_data = true 19 | 20 | [FirefoxChannelSwitching.config] 21 | anomaly_config = 'mww_nonparametric("nightly", 3, 3, 4, 0.6) mww_nonparametric("beta", 3, 3, 4, 0.6)' 22 | --]] 23 | _PRESERVATION_VERSION = 1 24 | 25 | local fx = require "fx" 26 | require "circular_buffer" 27 | require "cuckoo_filter" 28 | local l = require "lpeg" 29 | require "string" 30 | 31 | local alert = require "alert" 32 | local annotation = require "annotation" 33 | local anomaly = require "anomaly" 34 | local anomaly_config = anomaly.parse_config(read_config("anomaly_config")) 35 | 36 | local rows = read_config("rows") or 180 37 | local sec_per_row = read_config("sec_per_row") or 60*60*24 38 | local COL_NEW = 1 39 | local COL_IN = 2 40 | local COL_OUT = 3 41 | 42 | local function create_cbuf() 43 | local cb = circular_buffer.new(rows, COL_OUT, sec_per_row, true) 44 | cb:set_header(COL_NEW , "new") 45 | cb:set_header(COL_IN , "switched in") 46 | cb:set_header(COL_OUT , "switched out") 47 | return cb 48 | end 49 | 50 | channels = { 51 | {name = "release" , cb = create_cbuf(), cf = cuckoo_filter.new(100e6)}, 52 | {name = "beta" , cb = create_cbuf(), cf = cuckoo_filter.new(10e6)}, 53 | {name = "nightly" , cb = create_cbuf(), cf = cuckoo_filter.new(1e6)}, 54 | -- aurora uses a different profile so we do not expect to see any switches 55 | {name = "aurora", cb = create_cbuf(), cf = cuckoo_filter.new(1e6)}, 56 | {name = "Other" , cb = create_cbuf(), cf = cuckoo_filter.new(100e6)}, 57 | } 58 | local CHANNELS_SIZE = #channels 59 | 60 | function process_message() 61 | local cid = read_message("Fields[clientId]") 62 | if not cid then return -1, "missing clientId" end 63 | 64 | local chan = read_message("Fields[channel]") 65 | if not chan then return -1, "missing channel" end 66 | 67 | chan = fx.normalize_channel(chan) 68 | 69 | local ts = read_message("Timestamp") 70 | local matched, added, deleted = nil, false, false 71 | for i=1, CHANNELS_SIZE do 72 | local v = channels[i] 73 | if v.name == chan then 74 | added = v.cf:add(cid) 75 | matched = v 76 | else 77 | if v.cf:delete(cid) then 78 | v.cb:add(ts, COL_OUT, 1) 79 | deleted = true 80 | end 81 | end 82 | 83 | end 84 | 85 | if added then 86 | if deleted then 87 | matched.cb:add(ts, COL_IN, 1) 88 | else 89 | matched.cb:add(ts, COL_NEW, 1) 90 | end 91 | end 92 | 93 | return 0 94 | end 95 | 96 | function timer_event(ns) 97 | for i,v in ipairs(channels) do 98 | if anomaly_config then 99 | if not alert.throttled(ns) then 100 | local msg, annos = anomaly.detect(ns, v.name, v.cb, anomaly_config) 101 | if msg then 102 | alert.queue(ns, msg) 103 | 
annotation.concat(v.name, annos) 104 | end 105 | end 106 | local a = annotation.prune(v.name, ns) 107 | if a then 108 | inject_payload("cbuf", v.name, a, v.cb:format("cbuf")) 109 | else 110 | inject_payload("cbuf", v.name, v.cb:format("cbuf")) 111 | end 112 | else 113 | inject_payload("cbuf", v.name, v.cb:format("cbuf")) 114 | end 115 | inject_payload("cbufd", v.name, v.cb:format("cbufd")) 116 | end 117 | alert.send_queue(ns) 118 | end 119 | -------------------------------------------------------------------------------- /heka/sandbox/filters/firefox_duplicates.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Firefox Duplicate Telemetry Submission Report 7 | 8 | *Example Heka Configuration* 9 | 10 | .. code-block:: ini 11 | 12 | [FirefoxDuplicates] 13 | type = "SandboxFilter" 14 | filename = "lua_filters/firefox_duplicates.lua" 15 | message_matcher = "Logger == 'fx' && Type == 'executive_summary' && Fields[vendor] == 'Mozilla' && Fields[app] == 'Firefox'" 16 | output_limit = 0 17 | memory_limit = 0 18 | instruction_limit = 0 19 | ticker_interval = 0 20 | preserve_data = false 21 | timer_event_on_shutdown = true 22 | 23 | [FirefoxDuplicates.config] 24 | items = 100000000 25 | --]] 26 | 27 | require "bloom_filter" 28 | require "circular_buffer" 29 | local fx = require "fx" 30 | 31 | local items = read_config("items") or 1000000 32 | local probability = read_config("probability") or 0.01 33 | bf = bloom_filter.new(items, probability) 34 | 35 | local cols = fx.get_channel_count() 36 | cb = circular_buffer.new(180, cols, 60*60*24, true) 37 | for i=1, cols do 38 | cb:set_header(i, fx.get_channel_name(i-1)) 39 | end 40 | 41 | cids = {} 42 | 43 | function process_message() 44 | local did = read_message("Fields[documentId]") 45 | if type(did) == "string" then 46 | if not bf:add(did) then 47 | local ts = read_message("Timestamp") 48 | local channel = read_message("Fields[channel]") 49 | cb:add(ts, fx.get_channel_id(channel) + 1, 1) 50 | 51 | local cid = read_message("Fields[clientId]") 52 | if type(cid) == "string" then 53 | cids[cid] = true 54 | end 55 | end 56 | end 57 | return 0 58 | end 59 | 60 | local title = "graph" 61 | function timer_event(ns) 62 | inject_payload("cbuf", title, cb:format("cbuf")) 63 | inject_payload("cbufd", title, cb:format("cbufd")) 64 | 65 | local found = false 66 | for k,_ in pairs(cids) do 67 | add_to_payload(k, "\n") 68 | found = true 69 | end 70 | 71 | if found then 72 | inject_payload("txt", "clients") 73 | cids = {} 74 | end 75 | end 76 | -------------------------------------------------------------------------------- /heka/sandbox/filters/firefox_searches.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Calculates search totals by engine, origin, and country. 7 | 8 | Config: 9 | 10 | *Example Heka Configuration* 11 | 12 | .. 
code-block:: ini 13 | 14 | [FirefoxSearches] 15 | type = "SandboxFilter" 16 | filename = "lua_filters/firefox_searches.lua" 17 | message_matcher = "Type == 'telemetry' && Fields[docType] == 'main' && Fields[appName] == 'Firefox' && Fields[appVendor] == 'Mozilla'" 18 | ticker_interval = 60 19 | output_limit = 512000 20 | preserve_data = true 21 | --]] 22 | _PRESERVATION_VERSION = 1 23 | 24 | require "cjson" 25 | require "circular_buffer" 26 | require "math" 27 | require "os" 28 | require "string" 29 | 30 | local ROWS = 365 31 | local SEC_PER_ROW = 60 * 60 * 24 32 | 33 | local origins = {"abouthome", "contextmenu", "searchbar", "urlbar", "total"} 34 | local ORIGINS_SIZE = #origins 35 | 36 | local countries = {"US", "CN", "RU", "Total"} 37 | local COUNTRIES_SIZE = #countries 38 | 39 | local function make_cbuf() 40 | local cb = circular_buffer.new(ROWS, ORIGINS_SIZE, SEC_PER_ROW, true) 41 | for i, v in ipairs(origins) do 42 | cb:set_header(i, v) 43 | end 44 | return cb 45 | end 46 | 47 | engines = { 48 | {name = "Bing" , cbuf = make_cbuf(), match = "[Bb]ing"}, 49 | {name = "Google", cbuf = make_cbuf(), match = "[Gg]oogle"}, 50 | {name = "Yahoo" , cbuf = make_cbuf(), match = "[Yy]ahoo"}, 51 | {name = "Other" , cbuf = make_cbuf(), match = "."} 52 | } 53 | 54 | totals = circular_buffer.new(ROWS, #engines * COUNTRIES_SIZE, SEC_PER_ROW, true) 55 | for i, v in ipairs(engines) do 56 | for j, c in ipairs(countries) do 57 | totals:set_header((i-1) * COUNTRIES_SIZE + j, string.format("%s_%s", v.name, c)) 58 | end 59 | end 60 | 61 | local time = os.time 62 | function process_message () 63 | local json = read_message("Fields[payload.keyedHistograms]") 64 | if not json then return -1, "no keyedHistograms" end 65 | 66 | local ok, khist = pcall(cjson.decode, json) 67 | if not ok then return -1, khist end 68 | if type(khist.SEARCH_COUNTS) ~= "table" then return -1, "no SEARCH_COUNTS" end 69 | 70 | local ts = read_message("Timestamp") 71 | for k, v in pairs(khist.SEARCH_COUNTS) do 72 | for i, e in ipairs(engines) do 73 | if string.match(k, e.match) then 74 | if type(v.sum) ~= "number" then return -1, string.format("missing %s.sum", k) end 75 | local c = v.sum 76 | local cc = read_message("Fields[geoCountry]") 77 | for n = 1, COUNTRIES_SIZE - 1 do 78 | if cc == countries[n] then 79 | totals:add(ts, (i-1) * COUNTRIES_SIZE + n, c) 80 | break 81 | end 82 | end 83 | totals:add(ts, (i-1) * COUNTRIES_SIZE + COUNTRIES_SIZE, c) 84 | 85 | for n = 1, ORIGINS_SIZE - 1 do 86 | if string.match(k, origins[n]) then 87 | e.cbuf:add(ts, n, c) 88 | break 89 | end 90 | end 91 | e.cbuf:add(ts, ORIGINS_SIZE, c) 92 | break 93 | end 94 | end 95 | end 96 | return 0 97 | end 98 | 99 | local floor = math.floor 100 | local date = os.date 101 | local json = {} 102 | for i=1, ROWS do 103 | json[i] = {date = "", time_t = 0} 104 | for m, e in ipairs(engines) do 105 | local t = {} 106 | json[i][e.name] = t 107 | for j, c in ipairs(countries) do 108 | t[c] = 0 109 | end 110 | end 111 | end 112 | 113 | local title = "Totals" 114 | function timer_event(ns) 115 | for i, v in ipairs(engines) do 116 | inject_payload("cbuf", v.name, v.cbuf:format("cbuf")) 117 | inject_payload("cbufd", v.name, v.cbuf:format("cbufd")) 118 | end 119 | inject_payload("cbuf", title, totals:format("cbuf")) 120 | inject_payload("cbufd", title, totals:format("cbufd")) 121 | 122 | local ts = totals:current_time() - (ROWS - 1) * SEC_PER_ROW * 1e9 123 | for i, v in ipairs(json) do 124 | v.time_t = floor(ts/1e9) 125 | v.date = date("%F", v.time_t) 126 | for m, e in 
ipairs(engines) do 127 | for j, c in ipairs(countries) do 128 | local val = totals:get(ts, (m-1) * COUNTRIES_SIZE + j) 129 | if val ~= val then val = 0 end 130 | v[e.name][c] = val 131 | end 132 | end 133 | ts = ts + SEC_PER_ROW * 1e9 134 | end 135 | inject_payload("json", "totals", cjson.encode(json)) 136 | end 137 | -------------------------------------------------------------------------------- /heka/sandbox/filters/firefox_usage.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Firefox Usage Hours 7 | 8 | *Example Heka Configuration* 9 | 10 | .. code-block:: ini 11 | 12 | [FirefoxUsage] 13 | type = "SandboxFilter" 14 | filename = "lua_filters/firefox_usage.lua" 15 | message_matcher = "Logger == 'fx' && Type == 'executive_summary' && Fields[docType] == 'main' && Fields[vendor] == 'Mozilla' && Fields[app] == 'Firefox'" 16 | ticker_interval = 60 17 | preserve_data = true 18 | --]] 19 | 20 | require "circular_buffer" 21 | require "cjson" 22 | require "math" 23 | require "os" 24 | require "string" 25 | 26 | local DAYS = 30 27 | local SEC_IN_DAY = 60 * 60 * 24 28 | local floor = math.floor 29 | local date = os.date 30 | 31 | day_cb = circular_buffer.new(DAYS, 1, SEC_IN_DAY, true) 32 | day_cb:set_header(1, "Active Hours") 33 | current_day = -1 34 | 35 | local month_names = {"Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", 36 | "Sep", "Oct", "Nov", "Dec"} 37 | local MONTHS = #month_names 38 | months = {} 39 | for i=1,MONTHS do 40 | months[i] = 0 41 | end 42 | current_month = -1 43 | 44 | local function clear_months(s, n) 45 | for i = 1, n do 46 | s = s + 1 47 | if s > MONTHS then s = 1 end 48 | months[s] = 0 49 | end 50 | end 51 | 52 | local function update_month(ts, uptime, day_changed, day_advanced) 53 | local month = current_month 54 | if current_month == -1 or day_changed then 55 | local t = date("*t", ts / 1e9) 56 | month = tonumber(t.month) 57 | if current_month == -1 then current_month = month end 58 | end 59 | 60 | if day_advanced then 61 | local delta = month - current_month 62 | if delta > 0 then 63 | clear_months(current_month, delta) 64 | current_month = month 65 | elseif delta < 0 then -- roll over the year 66 | clear_months(current_month, MONTHS + delta) 67 | current_month = month 68 | end 69 | end 70 | months[month] = months[month] + uptime 71 | end 72 | 73 | ---- 74 | 75 | function process_message() 76 | local hours = read_message("Fields[hours]") 77 | if type(hours) ~= "number" then 78 | return -1, "missing/invalid hours" 79 | end 80 | if hours == 0 then return 0 end 81 | 82 | local ts = read_message("Timestamp") 83 | local day = floor(ts / (SEC_IN_DAY * 1e9)) 84 | local day_changed = day ~= current_day 85 | local day_advanced = false 86 | if day > current_day then 87 | current_day = day 88 | day_advanced = true 89 | elseif current_day - day > 360 * SEC_IN_DAY then 90 | return -1, "data is too old" 91 | end 92 | 93 | day_cb:add(ts, 1, hours) 94 | update_month(ts, hours, day_changed, day_advanced) 95 | 96 | return 0 97 | end 98 | 99 | local title = "Firefox Daily Active Hours" 100 | function timer_event(ns) 101 | inject_payload("cbuf", title, day_cb:format("cbuf")) 102 | inject_payload("cbufd", title, day_cb:format("cbufd")) 103 | 104 | local json = {} 105 | local idx = current_month 106 | if idx == -1 then 
idx = 0 end 107 | 108 | for i=1,MONTHS do 109 | idx = idx + 1 110 | if idx > MONTHS then idx = 1 end 111 | json[i] = {[month_names[idx]] = months[idx]} 112 | end 113 | inject_payload("json", "Firefox Monthly Active Hours", cjson.encode(json)) 114 | end 115 | -------------------------------------------------------------------------------- /heka/sandbox/filters/payload_size.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Extract submission sizes and counts for pipeline messages, emitting small 7 | derived messages for reporting. 8 | 9 | *Example Heka Configuration* 10 | 11 | .. code-block:: ini 12 | 13 | [PayloadSize] 14 | type = "SandboxFilter" 15 | filename = "lua_filters/payload_size.lua" 16 | message_matcher = "Type == 'telemetry' && Logger == 'telemetry'" 17 | ticker_interval = 0 18 | preserve_data = false 19 | 20 | --]] 21 | 22 | local msg = { 23 | Timestamp = nil, 24 | Type = "payload_size", 25 | Payload = nil, 26 | Fields = { 27 | build = "", 28 | channel = "", 29 | docType = "", 30 | size = 0, 31 | submissionDate = "", 32 | } 33 | } 34 | 35 | function process_message() 36 | msg.Timestamp = read_message("Timestamp") 37 | msg.Fields.build = read_message("Fields[appBuildId]") 38 | msg.Fields.channel = read_message("Fields[appUpdateChannel]") 39 | msg.Fields.docType = read_message("Fields[docType]") 40 | msg.Fields.size = read_message("Fields[Size]") 41 | 42 | -- This could be computed from msg.Timestamp, but we need the field for 43 | -- partitioning the data in the S3 Output. 44 | msg.Fields.submissionDate = read_message("Fields[submissionDate]") 45 | 46 | inject_message(msg) 47 | return 0 48 | end 49 | 50 | function timer_event(ns) 51 | 52 | end 53 | -------------------------------------------------------------------------------- /heka/sandbox/filters/telemetry_decoder_view.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Creates a summary view of the TelemetryDecoder Statistics. 
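Totals and failures are derived from the 'heka.all-report' self-monitoring messages by taking the per-decoder delta of ProcessMessageCount-TelemetryDecoder and ProcessMessageFailures-TelemetryDecoder between successive reports; duplicate submissions are detected by inserting each telemetry documentId into a bloom filter that is cleared after each full circular-buffer span.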
7 | 8 | [TelemetryStats] 9 | type = "SandboxFilter" 10 | message_matcher = "Type == 'telemetry' || Type == 'heka.all-report'" 11 | filename = "lua_filters/telemetry_decoder_view.lua" 12 | memory_limit = 120000000 13 | output_limit = 256000 14 | ticker_interval = 60 15 | preserve_data = true 16 | --]] 17 | 18 | require "bloom_filter" 19 | require "circular_buffer" 20 | require "cjson" 21 | require "string" 22 | local alert = require "alert" 23 | 24 | local SEC_PER_ROW = 60 25 | local ROWS = 2880 26 | 27 | local items = read_config("bloom_items") or 3*1e6 28 | local probability = read_config("bloom_probability") or 0.01 29 | local decoder_match = read_config("decoder_match") or "^TelemetryKafkaInput(%d+)" 30 | bf = bloom_filter.new(items, probability) 31 | cb = circular_buffer.new(ROWS, 3, SEC_PER_ROW, true) 32 | local TOTAL = cb:set_header(1, "Total") 33 | local FAILURES = cb:set_header(2, "Failures") 34 | local DUPLICATES = cb:set_header(3, "Duplicates") 35 | id_count = {} -- array of decoder ids and the last seen count 36 | id_failures = {} -- array of decoder ids and the last seen failure count 37 | 38 | local alert_throttle = read_config("alert_throttle") or 3600 39 | alert.set_throttle(alert_throttle * 1e9) 40 | 41 | -- multi-decoder cascade means that we may need to apply a scaling factor to 42 | -- get an accurate count 43 | local scaling_factor = read_config("scaling_factor") or 1 44 | 45 | local function update_delta(ts, col, id, parray, cur) 46 | local previous = parray[id] 47 | if previous then 48 | if type(cur) == "number" then 49 | if cur > previous then 50 | local delta = cur - previous 51 | parray[id] = cur 52 | cb:add(ts, col, delta) 53 | elseif cur < previous then -- system restart 54 | parray[id] = cur 55 | cb:add(ts, col, cur) 56 | end 57 | end 58 | else 59 | if type(cur) == "number" then 60 | parray[id] = cur 61 | cb:set(ts, col, 0/0) -- advance the buffer with a NaN entry 62 | end 63 | end 64 | end 65 | 66 | ---- 67 | 68 | function process_message () 69 | local typ = read_message("Type") 70 | local ts = read_message("Timestamp") 71 | 72 | if typ == "heka.all-report" then 73 | local ok, json = pcall(cjson.decode, read_message("Payload")) 74 | if not ok then return -1, json end 75 | 76 | local t = json.decoders 77 | if not t then 78 | return -1, "No Decoders found" 79 | end 80 | 81 | for i,v in ipairs(t) do 82 | if not v.Name then 83 | return -1, "Decoder is missing its name" 84 | end 85 | 86 | local id = string.match(v.Name, decoder_match) 87 | if id then 88 | id = tonumber(id) 89 | 90 | if type(v["ProcessMessageCount-TelemetryDecoder"]) == "table" then 91 | update_delta(ts, TOTAL, id, id_count, v["ProcessMessageCount-TelemetryDecoder"].value / scaling_factor) 92 | end 93 | 94 | if type(v["ProcessMessageFailures-TelemetryDecoder"]) == "table" then 95 | update_delta(ts, FAILURES, id, id_failures, v["ProcessMessageFailures-TelemetryDecoder"].value) 96 | end 97 | end 98 | end 99 | elseif typ == "telemetry" then 100 | local did = read_message("Fields[documentId]") 101 | if not did then 102 | return -1, "No documentId" 103 | end 104 | 105 | local added = bf:add(did) 106 | if not added then 107 | cb:add(ts, DUPLICATES, 1) 108 | end 109 | end 110 | 111 | return 0 112 | end 113 | 114 | last_cleared = nil 115 | 116 | local title = "Telemetry Decoder Statistics" 117 | function timer_event(ns) 118 | if last_cleared and ns - last_cleared >= 1e9 * ROWS * SEC_PER_ROW then 119 | bf:clear() 120 | last_cleared = ns 121 | elseif not last_cleared then 122 | last_cleared = ns 123 | end 124 
| 125 | if not cb:get(ns, 1) then 126 | cb:add(ns, 1, 0/0) -- always advance the buffer/graph using a NaN value 127 | end 128 | 129 | local sum, samples = cb:compute("sum", 1, cb:current_time() - (SEC_PER_ROW * 1e9)) 130 | if samples == 0 then 131 | alert.send(ns, "no new data") 132 | end 133 | inject_payload("cbuf", title, cb:format("cbuf")) 134 | inject_payload("cbufd", title, cb:format("cbufd")) 135 | end 136 | -------------------------------------------------------------------------------- /heka/sandbox/filters/telemetry_requests.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | _PRESERVATION_VERSION = 1 5 | 6 | require "circular_buffer" 7 | 8 | local title = "Telemetry Requests" 9 | local rows = read_config("rows") or 14400 10 | local sec_per_row = read_config("sec_per_row") or 60 11 | 12 | cbuf = circular_buffer.new(rows, 1, sec_per_row, true) 13 | cbuf:set_header(1, "Requests") 14 | 15 | function process_message () 16 | cbuf:add(read_message("Timestamp"), 1, 1) 17 | return 0 18 | end 19 | 20 | function timer_event(ns) 21 | inject_payload("cbuf", title, cbuf:format("cbuf")) 22 | inject_payload("cbufd", title, cbuf:format("cbufd")) 23 | end 24 | -------------------------------------------------------------------------------- /heka/sandbox/filters/telemetry_s3output_monitors.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Monitors ProcessFileFailures and ProcessMessageCount in the S3 outputs 7 | 8 | Config: 9 | 10 | *Example Heka Configuration* 11 | 12 | .. 
code-block:: ini 13 | 14 | [TelemetryS3OutputMonitors] 15 | type = "SandboxFilter" 16 | filename = "lua_filters/telemetry_s3output_monitors.lua" 17 | ticker_interval = 60 18 | preserve_data = false # should always be reset on Heka restarts 19 | message_matcher = "Type == 'heka.all-report'" 20 | [TelemetryS3OutputMonitors.config] 21 | # CSV to ignore low volume streams 22 | ignore_stalls = "TelemetryErrorsOutput,TelemetryLoopOutput" 23 | --]] 24 | 25 | require "cjson" 26 | require "string" 27 | local alert = require "alert" 28 | local l = require "lpeg" 29 | 30 | local sep = l.P(",") 31 | local elem = l.C((1 - sep)^1) 32 | local item = elem / l.P 33 | local list = item * ("," * item)^0 34 | local function add (a, b) return a + b end 35 | local grammar = l.Cf(list, add) 36 | grammar = grammar:match(read_config("ignore_stalls") or "TelemetryErrorsOutput") 37 | 38 | local plugins = {} 39 | 40 | local function find_plugin(name, ts) 41 | local p = plugins[name] 42 | if not p then 43 | p = {last_alert = 0, last_pff = 0, last_pmc = 0, last_update = ts} 44 | plugins[name] = p 45 | end 46 | return p 47 | end 48 | 49 | function process_message () 50 | local ok, json = pcall(cjson.decode, read_message("Payload")) 51 | if not ok then return -1, json end 52 | if type(json.outputs) ~= "table" then return -1, "missing outputs array" end 53 | 54 | local ts = read_message("Timestamp") 55 | 56 | for i,v in ipairs(json.outputs) do 57 | if type(v) ~= "table" then return -1, "invalid output object" end 58 | if type(v.ProcessFileFailures) == "table" then -- confirm this plugin has the S3 instrumentation 59 | if not v.Name then return -1, "missing plugin Name" end 60 | 61 | local p = find_plugin(v.Name, ts) 62 | local n = v.ProcessFileFailures.value 63 | if type(n) == "number" and n > p.last_pff then 64 | p.msg = string.format("%s ProcessFileFailures has increased to %d", v.Name, n) 65 | p.last_pff = n 66 | end 67 | 68 | if not grammar:match(v.Name) then 69 | n = v.ProcessMessageCount.value 70 | if type(n) == "number" then 71 | if n == p.last_pmc then 72 | if p.last_update + 60 * 1e9 < ts then 73 | p.msg = string.format("%s ProcessMessageCount has stalled at %d", v.Name, n) 74 | end 75 | else 76 | if ts >= p.last_update then 77 | p.last_update = ts 78 | p.last_pmc = n 79 | end 80 | end 81 | end 82 | end 83 | end 84 | end 85 | return 0 86 | end 87 | 88 | function timer_event(ns) 89 | for k,v in pairs(plugins) do 90 | if v.msg then 91 | if ns - v.last_alert > 60 * 60 * 1e9 then -- manual throttling (one alert per plugin per hour) 92 | alert.queue(0, v.msg) 93 | v.last_alert = ns 94 | end 95 | end 96 | v.msg = nil 97 | end 98 | alert.send_queue(0) 99 | end 100 | -------------------------------------------------------------------------------- /heka/sandbox/filters/telemetry_webrtc.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Derived stream for webrtc. https://bugzilla.mozilla.org/show_bug.cgi?id=1231410 7 | 8 | *Example Heka Configuration* 9 | 10 | .. 
code-block:: ini 11 | 12 | [TelemetryWebRTC] 13 | type = "SandboxFilter" 14 | filename = "lua_filters/telemetry_webrtc.lua" 15 | message_matcher = "Type == 'telemetry' && Logger == 'telemetry'" 16 | ticker_interval = 0 17 | preserve_data = false 18 | 19 | --]] 20 | 21 | require 'cjson' 22 | 23 | local function check_payload (payload) 24 | if type(payload) ~= "table" then return false end 25 | local w = payload["webrtc"] or {} 26 | local i = w["IceCandidatesStats"] or {} 27 | if next(i["webrtc"] or {}) or next(i["loop"] or {}) then 28 | return true 29 | end 30 | return false 31 | end 32 | 33 | function process_message() 34 | local ok, json = pcall(cjson.decode, read_message("Payload")) 35 | if not ok then return -1, json end 36 | local p = json["payload"] or {} 37 | local found = check_payload(p) 38 | if not found then 39 | -- check child payloads for E10s 40 | local children = read_message("Fields[payload.childPayloads]") 41 | if not children then return 0 end 42 | ok, json = pcall(cjson.decode, children) 43 | if not ok then return -1, children end 44 | if type(json) ~= "table" then return -1 end 45 | for i, child in ipairs(json) do 46 | found = check_payload(child) 47 | if found then break end 48 | end 49 | end 50 | 51 | if found then 52 | local raw = read_message("raw") 53 | inject_message(raw) 54 | end 55 | return 0 56 | end 57 | 58 | function timer_event(ns) 59 | -- no op 60 | end 61 | -------------------------------------------------------------------------------- /hindsight/analysis/landfill_error.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Simple debug tool to track the types of error in landfill processing. Used 7 | when tuning the validation schemas. 8 | 9 | Config: 10 | 11 | filename = "landfill_errors.lua" 12 | message_matcher = "Type == 'telemetry.error'" 13 | --]] 14 | 15 | require "string" 16 | 17 | local err_msgs = {} 18 | 19 | function process_message() 20 | local de = read_message("Fields[DecodeError]") or "" 21 | local cnt = err_msgs[de] 22 | if cnt then 23 | err_msgs[de] = cnt + 1 24 | else 25 | err_msgs[de] = 1 26 | end 27 | return 0 28 | end 29 | 30 | function timer_event(ns, shutdown) 31 | for k,v in pairs(err_msgs) do 32 | add_to_payload(v, "\t", k, "\n") 33 | end 34 | inject_payload("tsv", "error") 35 | end 36 | -------------------------------------------------------------------------------- /hindsight/input/heka_s3.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | -- This Source Code Form is subject to the terms of the Mozilla Public 6 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 7 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 8 | 9 | --[[ 10 | ## Reader for the S3 Heka files in compressed or uncompressed form 11 | 12 | Retrieves/reads each file from the `s3_file_list`. The primary use of this 13 | plugin is to feed the transformed/validated data into analysis plugins. 
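For reference, a minimal sketch of a downstream analysis plugin (hypothetical, not part of this repository) that tallies the injected messages by Type could look like:

```lua
-- Hypothetical companion analysis plugin: count injected messages by Type.
local counts = {}

function process_message()
    local t = read_message("Type") or "UNKNOWN"
    counts[t] = (counts[t] or 0) + 1
    return 0
end

function timer_event(ns)
    for k, v in pairs(counts) do
        add_to_payload(k, "\t", v, "\n")
    end
    inject_payload("tsv", "message_type_counts")
end
```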
14 | 15 | ### Sample Configuration 16 | ```lua 17 | filename = "heka_s3.lua" 18 | s3_bucket = "net-mozaws-prod-us-west-2-pipeline-data" 19 | s3_file_list = "files.ls.1" 20 | tmp_dir = "/mnt/work/tmp" 21 | ``` 22 | --]] 23 | 24 | require "io" 25 | require "heka_stream_reader" 26 | require "os" 27 | require "string" 28 | 29 | local tmp_dir = read_config("tmp_dir") 30 | local s3_bucket = read_config("s3_bucket") or error("s3_bucket must be set") 31 | local logger = read_config("Logger") 32 | local s3_file_list = assert(io.open(read_config("s3_file_list"))) 33 | 34 | 35 | local function process_file(hsr, fn) 36 | local fh, err = io.open(fn) 37 | if not fh then 38 | print("failed to open", fn) 39 | return 40 | end 41 | 42 | local found, consumed, read 43 | repeat 44 | repeat 45 | found, consumed, read = hsr:find_message(fh) 46 | if found then 47 | inject_message(hsr) 48 | end 49 | until not found 50 | until read == 0 51 | fh:close() 52 | end 53 | 54 | 55 | local function execute_cmd(cmd, retries) 56 | local rv = 1 57 | for i=1, retries do 58 | rv = os.execute(cmd) 59 | if rv == 0 then 60 | break 61 | end 62 | end 63 | return rv 64 | end 65 | 66 | 67 | function process_message() 68 | local hsr = heka_stream_reader.new("s3") 69 | 70 | for fn in s3_file_list:lines() do 71 | local cmd 72 | local tfn = string.format("%s/%s", tmp_dir, logger) 73 | local ext = fn:match("%.([^.]-)$") 74 | if ext == "zst" then 75 | cmd = string.format("aws s3 cp s3://%s/%s - | zstd -d -c - > %s", s3_bucket, fn, tfn) 76 | elseif ext == "gz" then 77 | cmd = string.format("aws s3 cp s3://%s/%s - | gzip -d -c - > %s", s3_bucket, fn, tfn) 78 | else 79 | cmd = string.format("aws s3 cp s3://%s/%s %s", s3_bucket, fn, tfn) 80 | end 81 | 82 | print("processing", cmd) 83 | local rv = execute_cmd(cmd, 3) 84 | if rv == 0 then 85 | process_file(hsr, tfn) 86 | else 87 | print("failed to execute rv:", rv, " cmd:", cmd) 88 | end 89 | end 90 | return 0 91 | end 92 | -------------------------------------------------------------------------------- /hindsight/input/telemetry_s3_snappy.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | -- This Source Code Form is subject to the terms of the Mozilla Public 6 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 7 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 8 | 9 | --[[ 10 | ## Reader for the S3 telemetry files that are Heka framed, snappy encoded 11 | messages 12 | 13 | Retrieves/reads each file from the `s3_file_list`. The primary use of this 14 | plugin is to feed the transformed/validated data into analysis plugins. Once 15 | the snappy ugliness is removed (Bugzilla #1250218) the generalized 'heka_s3.lua' 16 | input can be used instead.
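For files that are not zst/gz re-compressed, the outer Heka framing is located without protobuf-decoding each record (`find_message(fh, false)`); each framed payload is then snappy-uncompressed when possible (falling back to the raw bytes if decompression fails) and re-parsed with a second stream reader before injection.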
17 | 18 | 19 | ### Sample Configuration 20 | ```lua 21 | filename = "telemetry_s3_snappy.lua" 22 | s3_bucket = "net-mozaws-prod-us-west-2-pipeline-data" 23 | s3_file_list = "telemetry_dims.ls.1" 24 | tmp_dir = "/mnt/work/tmp" 25 | ``` 26 | --]] 27 | 28 | require "io" 29 | require "heka_stream_reader" 30 | require "os" 31 | require "snappy" 32 | require "string" 33 | 34 | local tmp_dir = read_config("tmp_dir") 35 | local s3_bucket = read_config("s3_bucket") or error("s3_bucket must be set") 36 | local logger = read_config("Logger") 37 | local s3_file_list = assert(io.open(read_config("s3_file_list"))) 38 | 39 | 40 | local function snappy_decode(msgbytes) 41 | local ok, uc = pcall(snappy.uncompress, msgbytes) 42 | if ok then 43 | return uc 44 | end 45 | return msgbytes 46 | end 47 | 48 | 49 | local function process_snappy_ugliness(hsr, dhsr, fh) 50 | local found, consumed, read 51 | repeat 52 | repeat 53 | found, consumed, read = hsr:find_message(fh, false) -- don't protobuf decode 54 | if found then 55 | local pbm = snappy_decode(hsr:read_message("raw")) 56 | local ok = pcall(dhsr.decode_message, dhsr, pbm) 57 | if ok then 58 | inject_message(dhsr) 59 | end 60 | end 61 | until not found 62 | until read == 0 63 | end 64 | 65 | 66 | local function process_file(hsr, fh) 67 | local found, consumed, read 68 | repeat 69 | repeat 70 | found, consumed, read = hsr:find_message(fh) 71 | if found then 72 | inject_message(hsr) 73 | end 74 | until not found 75 | until read == 0 76 | end 77 | 78 | 79 | local function execute_cmd(cmd, retries) 80 | local rv = 1 81 | for i=1, retries do 82 | rv = os.execute(cmd) 83 | if rv == 0 then 84 | break 85 | end 86 | end 87 | return rv 88 | end 89 | 90 | 91 | function process_message() 92 | local hsr = heka_stream_reader.new("s3") 93 | local dhsr = heka_stream_reader.new("snappy") 94 | 95 | for fn in s3_file_list:lines() do 96 | local cmd 97 | local tfn = string.format("%s/%s", tmp_dir, logger) 98 | local ext = fn:match("%.([^.]-)$") 99 | if ext == "zst" then 100 | cmd = string.format("aws s3 cp s3://%s/%s - | zstd -d -c - > %s", s3_bucket, fn, tfn) 101 | elseif ext == "gz" then 102 | cmd = string.format("aws s3 cp s3://%s/%s - | gzip -d -c - > %s", s3_bucket, fn, tfn) 103 | else 104 | ext = nil 105 | cmd = string.format("aws s3 cp s3://%s/%s %s", s3_bucket, fn, tfn) 106 | end 107 | 108 | print("processing", cmd) 109 | local rv = execute_cmd(cmd, 3) 110 | if rv == 0 then 111 | local fh, err = io.open(tfn) 112 | if not fh then 113 | print("failed to open", tfn) 114 | return 0 115 | end 116 | if ext then 117 | process_file(hsr, fh) 118 | else 119 | process_snappy_ugliness(hsr, dhsr, fh) 120 | end 121 | fh:close() 122 | else 123 | print("failed to execute rv:", rv, " cmd:", cmd) 124 | end 125 | end 126 | return 0 127 | end 128 | -------------------------------------------------------------------------------- /hindsight/io_modules/derived_stream/heka_protobuf.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 
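-- Each schema entry handled by the derived_stream writers in this directory is
-- a five-element array: {name, type, size, attributes, source}, where `source`
-- is either a read_message() field specifier string or a function returning
-- the value.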
4 | 5 | local M = {} 6 | local ipairs = ipairs 7 | local type = type 8 | 9 | local read_message = read_message 10 | local encode_message = encode_message 11 | 12 | local match = require "string".match 13 | 14 | setfenv(1, M) -- Remove external access to contain everything in the module 15 | 16 | function write_message(fh, msg, schema) 17 | for i,v in ipairs(schema) do 18 | local value 19 | if type(v[5]) == "function" then 20 | value = v[5]() 21 | elseif type(v[5]) == "string" then 22 | value = read_message(v[5]) 23 | end 24 | 25 | if value ~= nil then 26 | if v[1] == "Uuid" then 27 | msg.Uuid = value 28 | elseif v[1] == "Timestamp" then 29 | msg.Timestamp = value 30 | elseif v[1] == "Type" then 31 | msg.Type = value 32 | elseif v[1] == "Logger" then 33 | msg.Logger = value 34 | elseif v[1] == "Severity" then 35 | msg.Severity = value 36 | elseif v[1] == "EnvVersion" then 37 | msg.EnvVersion = value 38 | elseif v[1] == "Pid" then 39 | msg.Pid = value 40 | elseif v[1] == "Hostname" then 41 | msg.Hostname = value 42 | else 43 | if type(value) == "number" and match(v[2], "INT") then 44 | msg.Fields[v[1]] = {value = value, value_type = 2} 45 | else 46 | msg.Fields[v[1]] = value 47 | end 48 | end 49 | end 50 | end 51 | fh:write(encode_message(msg, true)) 52 | end 53 | 54 | return M 55 | -------------------------------------------------------------------------------- /hindsight/io_modules/derived_stream/redshift.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | local M = {} 6 | local tostring = tostring 7 | local type = type 8 | 9 | local date = require "os".date 10 | local floor = require "math".floor 11 | local gsub = require "string".gsub 12 | 13 | setfenv(1, M) -- Remove external access to contain everything in the module 14 | 15 | VARCHAR_MAX_LENGTH = 65535 16 | 17 | function strip_nonprint(v) 18 | -- A CHAR column can only contain single-byte characters 19 | -- http://docs.aws.amazon.com/redshift/latest/dg/r_Character_types.html 20 | -- for our use restrict it to printable chars 21 | if v == nil then return end 22 | if type(v) ~= "string" then v = tostring(v) end 23 | return gsub(v, "[^\032-\126]", "?") 24 | end 25 | 26 | function esc_timestamp(v, default) 27 | if type(v) ~= "number" or v > 4294967296e9 or v < 0 then 28 | return default 29 | end 30 | return date("%Y-%m-%d %H:%M:%S.", floor(v / 1e9)) .. 
tostring(floor(v % 1e9 / 1e3)) 31 | end 32 | 33 | function esc_smallint(v, default) 34 | if type(v) ~= "number" or v > 32767 or v < -32767 then 35 | return default 36 | end 37 | return tostring(floor(v)) 38 | end 39 | 40 | function esc_integer(v, default) 41 | if type(v) ~= "number" or v > 2147483647 or v < -2147483647 then 42 | return default 43 | end 44 | return tostring(floor(v)) 45 | end 46 | 47 | function esc_bigint(v, default) 48 | if type(v) ~= "number" then return default end 49 | return tostring(floor(v)) 50 | end 51 | 52 | function esc_double(v, default) 53 | if type(v) ~= "number"then return default end 54 | if v ~= v then return "NaN" end 55 | if v == 1/0 then return "Infinity" end 56 | if v == -1/0 then return "-Infinity" end 57 | return tostring(v) 58 | end 59 | 60 | function esc_boolean(v, default) 61 | if type(v) ~= "boolean" then return default end 62 | if v then return "TRUE" end 63 | return "FALSE" 64 | end 65 | 66 | return M 67 | -------------------------------------------------------------------------------- /hindsight/io_modules/derived_stream/redshift/psv.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | local M = {} 6 | local ipairs = ipairs 7 | local read_message = read_message 8 | local tostring = tostring 9 | local type = type 10 | 11 | local rs = require "derived_stream.redshift" 12 | local string = require "string" 13 | 14 | setfenv(1, M) -- Remove external access to contain everything in the module 15 | 16 | local esc_chars = { ["|"] = "\\|", ["\r"] = "\\r", ["\n"] = "\\n", ["\\"] = "\\\\" } 17 | function esc_varchar(v, max) 18 | if v == nil then return "" end 19 | if max == nil then max = rs.VARCHAR_MAX_LENGTH end 20 | if type(v) ~= "string" then v = tostring(v) end 21 | if string.len(v) > max then v = string.sub(v, 1, max) end 22 | local s, e = string.find(v, "%z") 23 | if s then v = string.sub(v, 1, s-1) end 24 | return string.gsub(v, "[|\r\n\\]", esc_chars) 25 | end 26 | 27 | function write_message(fh, schema) 28 | for i,v in ipairs(schema) do 29 | local value 30 | if type(v[5]) == "function" then 31 | value = v[5]() 32 | elseif type(v[5]) == "string" then 33 | value = read_message(v[5]) 34 | end 35 | 36 | if v[2] == "TIMESTAMP" then 37 | value = rs.esc_timestamp(value, "") 38 | elseif v[2] == "SMALLINT" then 39 | value = rs.esc_smallint(value, "") 40 | elseif v[2] == "INTEGER" then 41 | value = rs.esc_integer(value, "") 42 | elseif v[2] == "BIGINT" then 43 | value = rs.esc_bigint(value, "") 44 | elseif v[2] == "DOUBLE PRECISION" or v[2] == "REAL" or v[2] == "DECIMAL" then 45 | value = rs.esc_double(value, "") 46 | elseif v[2] == "BOOLEAN" then 47 | value = rs.esc_boolean(value, "") 48 | elseif v[2] == "CHAR" then 49 | value = esc_varchar(rs.strip_nonprint(value), v[3]) 50 | elseif v[2] == "VARCHAR" or v[2] == "DATE" then 51 | value = esc_varchar(value, v[3]) 52 | else 53 | error("Invaild Redshift data type (aliases are not allowed): " .. 
tostring(v[2])) 54 | end 55 | 56 | if i > 1 then 57 | fh:write("|", value) 58 | else 59 | fh:write(value) 60 | end 61 | end 62 | fh:write("\n") 63 | end 64 | 65 | return M 66 | -------------------------------------------------------------------------------- /hindsight/io_modules/derived_stream/redshift/sql.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | local M = {} 6 | local error = error 7 | local ipairs = ipairs 8 | local read_message = read_message 9 | local tostring = tostring 10 | local type = type 11 | 12 | local rs = require "derived_stream.redshift" 13 | local string = require "string" 14 | local table = require "table" 15 | 16 | setfenv(1, M) -- Remove external access to contain everything in the module 17 | 18 | function get_create_table_sql(name, schema) 19 | local pieces = {"CREATE TABLE IF NOT EXISTS ", name, " ("} 20 | for i, c in ipairs(schema) do 21 | if i > 1 then 22 | table.insert(pieces, ",") 23 | end 24 | table.insert(pieces, string.format("%s %s", c[1], c[2])) 25 | if c[3] ~= nil then 26 | table.insert(pieces, string.format("(%s)", c[3])) 27 | end 28 | if c[4] then 29 | table.insert(pieces, " " .. c[4]) 30 | end 31 | end 32 | table.insert(pieces, ")") 33 | return table.concat(pieces) 34 | end 35 | 36 | function esc_timestamp(v) 37 | local ts = rs.esc_timestamp(v) 38 | if not ts then return "NULL" end 39 | return string.format("'%s'", ts) 40 | end 41 | 42 | function esc_varchar(con, v, max) 43 | if v == nil then return "NULL" end 44 | if max == nil then max = rs.VARCHAR_MAX_LENGTH end 45 | if type(v) ~= "string" then v = tostring(v) end 46 | if string.len(v) > max then v = string.sub(v, 1, max) end 47 | 48 | local escd = con:escape(v) 49 | if not escd then return "NULL" end 50 | return string.format("'%s'", escd) 51 | end 52 | 53 | function write_message(fh, schema, con) 54 | fh:write("(") 55 | for i,v in ipairs(schema) do 56 | local value = "NULL" 57 | if type(v[5]) == "function" then 58 | value = v[5]() 59 | elseif type(v[5]) == "string" then 60 | value = read_message(v[5]) 61 | end 62 | 63 | if v[2] == "TIMESTAMP" then 64 | value = esc_timestamp(value) 65 | elseif v[2] == "SMALLINT" then 66 | value = rs.esc_smallint(value, "NULL") 67 | elseif v[2] == "INTEGER" then 68 | value = rs.esc_integer(value, "NULL") 69 | elseif v[2] == "BIGINT" then 70 | value = rs.esc_bigint(value, "NULL") 71 | elseif v[2] == "DOUBLE PRECISION" or v[2] == "REAL" or v[2] == "DECIMAL" then 72 | value = rs.esc_double(value, "NULL") 73 | elseif v[2] == "BOOLEAN" then 74 | value = rs.esc_boolean(value, "NULL") 75 | elseif v[2] == "CHAR" then 76 | value = esc_varchar(con, rs.strip_nonprint(value), v[3]) 77 | elseif v[2] == "VARCHAR" or v[2] == "DATE" then 78 | value = esc_varchar(con, value, v[3]) 79 | else 80 | error("Invaild Redshift data type (aliases are not allowed): " .. 
tostring(v[2])) 81 | end 82 | 83 | if i > 1 then 84 | fh:write(",", value) 85 | else 86 | fh:write(value) 87 | end 88 | end 89 | fh:write(")") 90 | end 91 | 92 | return M 93 | -------------------------------------------------------------------------------- /hindsight/io_modules/derived_stream/tsv.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | local M = {} 6 | local ipairs = ipairs 7 | local tostring = tostring 8 | local type = type 9 | 10 | local read_message = read_message 11 | local encode_message = encode_message 12 | 13 | local gsub = require "string".gsub 14 | 15 | setfenv(1, M) -- Remove external access to contain everything in the module 16 | 17 | local esc_chars = { ["\t"] = "\\t", ["\r"] = "\\r", ["\n"] = "\\n", ["\\"] = "\\\\" } 18 | 19 | function esc_str(v) 20 | return gsub(v, "[\t\r\n\\]", esc_chars) 21 | end 22 | 23 | function write_message(fh, schema, nil_value) 24 | for i,v in ipairs(schema) do 25 | local value 26 | if type(v[5]) == "function" then 27 | value = v[5]() 28 | elseif type(v[5]) == "string" then 29 | value = read_message(v[5]) 30 | end 31 | if value == nil then 32 | value = nil_value 33 | else 34 | value = tostring(value) 35 | end 36 | 37 | if v[2] == "CHAR" or v[2] == "VARCHAR" then 38 | value = esc_str(value) 39 | end 40 | 41 | if i > 1 then 42 | fh:write("\t", value) 43 | else 44 | fh:write(value) 45 | end 46 | end 47 | fh:write("\n") 48 | end 49 | 50 | return M 51 | -------------------------------------------------------------------------------- /hindsight/modules/agg.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | local M = {} 6 | local type = type 7 | local pairs = pairs 8 | setfenv(1, M) -- Remove external access to contain everything in the module 9 | 10 | -- Merge two objects. Add all data from "src" to "dest". Numeric values are 11 | -- added, boolean and string values are overwritten, and arrays and objects are 12 | -- recursively merged. 13 | -- Any data with different types in dest and src will be skipped. 14 | -- Example: 15 | --local a = { 16 | -- foo = 1, 17 | -- bar = {1, 1, 3}, 18 | -- quux = 3 19 | --} 20 | --local b = { 21 | -- foo = 5, 22 | -- bar = {0, 0, 5, 1}, 23 | -- baz = { 24 | -- hello = 100 25 | -- } 26 | --} 27 | -- 28 | --local c = merge_objects(a, b) 29 | --------- 30 | -- c contains { 31 | -- foo = 5, 32 | -- bar = {1, 1, 8, 1}, 33 | -- baz = { 34 | -- hello = 100 35 | -- }, 36 | -- quux = 3 37 | --} 38 | function merge_objects(dest, src) 39 | if dest == nil then 40 | return src 41 | end 42 | if src == nil then 43 | return dest 44 | end 45 | 46 | local tdest = type(dest) 47 | local tsrc = type(src) 48 | 49 | -- Types are different. Ignore the src value, because src is wrong. 50 | if tdest ~= tsrc then 51 | return dest 52 | end 53 | 54 | -- types are the same, neither is nil. 
55 | if tdest == "number" then 56 | return dest + src 57 | end 58 | 59 | -- most recent wins: 60 | if tdest == "boolean" or tdest == "string" then 61 | return src 62 | end 63 | 64 | if tdest == "table" then 65 | -- array or object, iterate by key 66 | for k,v in pairs(src) do 67 | dest[k] = merge_objects(dest[k], v) 68 | end 69 | return dest 70 | end 71 | 72 | -- How did we get here? 73 | --print("weird type: ", tdest, "\n") 74 | return dest 75 | end 76 | 77 | return M 78 | -------------------------------------------------------------------------------- /hindsight/output/cbuf2tsv.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Convert a circular buffer output to a TSV for non-Heka dashboard consumption 7 | 8 | Config: 9 | output_path = path to write the converted cbuf(s) to 10 | 11 | 12 | *Example Heka Configuration* 13 | 14 | .. code-block:: ini 15 | 16 | [CbufToDashboard] 17 | type = "SandboxOutput" 18 | filename = "cbuf_dashboard.lua" 19 | message_matcher = "Type == 'heka.sandbox-output' && Fields[payload_type] == 'cbuf'" # convert all cbufs 20 | ticker_interval = 60 21 | 22 | [CbufToDashboard.config] 23 | output_path = "/tmp" 24 | 25 | 26 | Input: 27 | {"time":1423440000,"rows":4,"columns":1,"seconds_per_row":1,"column_info":[{"name":"Active_Users","unit":"count","aggregation":"sum"}]} 28 | 33031 29 | 33526 30 | 40143 31 | 38518 32 | 33 | Output: 34 | Time (time_t) Active Users (count) 35 | 1423440000 33031 36 | 1423440001 33526 37 | 1423440002 40143 38 | 1423440003 38518 39 | 40 | --]] 41 | 42 | require "cjson" 43 | require "io" 44 | require "string" 45 | require "table" 46 | 47 | local output_path = assert(read_config("output_path"), "output_path must be specified") 48 | 49 | function process_message() 50 | local header 51 | local cb_time = 0 52 | local cb_spr = 0 53 | local cb_rows = 0 54 | local body = {} 55 | local cnt = 0 56 | 57 | local payload = read_message("Payload") 58 | for l in string.gmatch(payload, ".-\n") do 59 | if not header then 60 | if string.match(l, "^{") then 61 | local ok, json = pcall(cjson.decode, l) 62 | if not ok then return -1, json end 63 | 64 | if type(json.time) == "number" and 65 | type(json.rows) == "number" and 66 | type(json.seconds_per_row) == "number" and 67 | type(json.column_info) == "table" then 68 | cb_time = json.time 69 | cb_spr = json.seconds_per_row 70 | cb_rows = json.rows 71 | local names = {"Time (time_t)"} 72 | for i, v in ipairs(json.column_info) do 73 | local ok, col = pcall(string.format, "%s (%s)", v.name, v.unit) 74 | if not ok then return -1, "invalid column_info" end 75 | names[i + 1] = col 76 | end 77 | header = table.concat(names, "\t") 78 | end 79 | end 80 | else 81 | cnt = cnt + 1 82 | body[cnt] = string.format("%d\t%s", (cnt - 1) * cb_spr + cb_time, l) 83 | end 84 | end 85 | 86 | if not header then return -1, "malformed cbuf, no header" end 87 | 88 | if cnt < 3 or cnt ~= cb_rows then 89 | return -1, string.format("incorrect number of rows expected: %d, received: %d", cb_rows, cnt) 90 | end 91 | 92 | local logger = read_message("Logger") 93 | 94 | local name = read_message("Fields[payload_name]") or "" 95 | name = string.gsub(name, "%W", "") 96 | if string.len(name) > 64 then name = string.sub(name, 1, 64) end 97 | 98 | local fh = 
assert(io.open(string.format("%s/%s.%s.tsv", output_path, logger, name), "w")) 99 | fh:write(header, "\n", table.concat(body)) 100 | fh:close() 101 | return 0 102 | end 103 | 104 | function timer_event(ns) 105 | -- used to force GC 106 | end 107 | -------------------------------------------------------------------------------- /hindsight/output/crash_summary.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Outputs a crash ping summary derived stream in the specified format one table/file per day. 7 | 8 | Config: 9 | 10 | filename = "crash_summary.lua" 11 | message_matcher = "Type == 'telemetry' && Fields[docType] == 'crash'" 12 | 13 | format = "redshift.psv" 14 | buffer_path = "/mnt/output" 15 | buffer_size = 20 * 1024 * 1024 16 | s3_path = "s3://test" 17 | 18 | --]] 19 | 20 | local ds = require "derived_stream" 21 | local fx = require "fx" 22 | local ping = require "fx.ping" 23 | 24 | local name = "crash_summary" 25 | local schema = { 26 | -- column name type length attributes field /function 27 | {"Timestamp" ,"TIMESTAMP" ,nil ,"SORTKEY" ,"Timestamp"}, 28 | {"crashDate" ,"DATE" ,nil ,nil ,function () return ping.get_date(ping.payload().payload.crashDate) end}, 29 | {"clientId" ,"CHAR" ,36 ,"DISTKEY" ,"Fields[clientId]"}, 30 | {"buildVersion" ,"VARCHAR" ,32 ,nil ,function () return ping.build().version end}, 31 | {"buildId" ,"CHAR" ,14 ,nil ,function () return ping.build().buildId end}, 32 | {"buildArchitecture" ,"VARCHAR" ,32 ,nil ,function () return ping.build().architecture end}, 33 | {"channel" ,"VARCHAR" ,7 ,nil ,function () return fx.normalize_channel(read_message("Fields[appUpdateChannel]")) end}, 34 | {"os" ,"VARCHAR" ,7 ,nil ,function () return fx.normalize_os(read_message("Fields[os]")) end}, 35 | {"osVersion" ,"VARCHAR" ,32 ,nil ,function () return ping.system().os.version end}, 36 | {"osServicepackMajor" ,"VARCHAR" ,32 ,nil ,function () return ping.system().os.servicePackMajor end}, 37 | {"osServicepackMinor" ,"VARCHAR" ,32 ,nil ,function () return ping.system().os.servicePackMinor end}, 38 | {"locale" ,"VARCHAR" ,32 ,nil ,function () return ping.settings().locale end}, 39 | {"activeExperimentId" ,"VARCHAR" ,32 ,nil ,function () return ping.addons().activeExperiment.id end}, 40 | {"activeExperimentBranch" ,"VARCHAR" ,32 ,nil ,function () return ping.addons().activeExperiment.branch end}, 41 | {"country" ,"VARCHAR" ,5 ,nil ,function () return fx.normalize_country(read_message("Fields[geoCountry]")) end}, 42 | {"hasCrashEnvironment" ,"BOOLEAN" ,nil ,nil ,function () return ping.payload().payload.hasCrashEnvironment end}, 43 | } 44 | 45 | local ds_pm 46 | ds_pm, timer_event = ds.load_schema(name, schema) 47 | 48 | function process_message() 49 | ping.clear_cache() 50 | return ds_pm() 51 | end 52 | 53 | -------------------------------------------------------------------------------- /hindsight/output/executive_summary.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 
4 | 5 | --[[ 6 | Outputs a executive summary based on the main and crash pings as a derived stream 7 | in the specified format one table/file per day. 8 | 9 | Config: 10 | 11 | filename = "executive_summary.lua" 12 | message_matcher = "Logger == 'fx' && Type == 'executive_summary'" 13 | 14 | format = "redshift.psv" 15 | buffer_path = "/mnt/output" 16 | buffer_size = 20 * 1024 * 1024 17 | s3_path = "s3://test" 18 | 19 | --]] 20 | 21 | local ds = require "derived_stream" 22 | local name = "executive_summary" 23 | local schema = { 24 | -- column name type length attributes field /function 25 | {"Timestamp" ,"TIMESTAMP" ,nil ,"SORTKEY" ,"Timestamp"}, 26 | {"activityTimestamp" ,"TIMESTAMP" ,nil ,nil ,"Fields[activityTimestamp]"}, 27 | {"profileCreationTimestamp" ,"TIMESTAMP" ,nil ,nil ,"Fields[profileCreationTimestamp]"}, 28 | {"buildId" ,"CHAR" ,14 ,nil ,"Fields[buildId]"}, 29 | {"clientId" ,"CHAR" ,36 ,"DISTKEY" ,"Fields[clientId]"}, 30 | {"documentId" ,"CHAR" ,36 ,nil ,"Fields[documentId]"}, 31 | {"docType" ,"CHAR" ,36 ,nil ,"Fields[docType]"}, 32 | {"country" ,"VARCHAR" ,5 ,nil ,"Fields[country]"}, 33 | {"channel" ,"VARCHAR" ,7 ,nil ,"Fields[channel]"}, 34 | {"os" ,"VARCHAR" ,7 ,nil ,"Fields[os]"}, 35 | {"osVersion" ,"VARCHAR" ,32 ,nil ,"Fields[osVersion]"}, 36 | {"app" ,"VARCHAR" ,32 ,nil ,"Fields[app]"}, 37 | {"version" ,"VARCHAR" ,32 ,nil ,"Fields[version]"}, 38 | {"vendor" ,"VARCHAR" ,32 ,nil ,"Fields[vendor]"}, 39 | {"reason" ,"VARCHAR" ,32 ,nil ,"Fields[reason]"}, 40 | {'"default"' ,"BOOLEAN" ,nil ,nil ,"Fields[default]"}, 41 | {"hours" ,"DOUBLE PRECISION" ,nil ,nil ,"Fields[hours]"}, 42 | {"google" ,"INTEGER" ,nil ,nil ,"Fields[google]"}, 43 | {"bing" ,"INTEGER" ,nil ,nil ,"Fields[bing]"}, 44 | {"yahoo" ,"INTEGER" ,nil ,nil ,"Fields[yahoo]"}, 45 | {"other" ,"INTEGER" ,nil ,nil ,"Fields[other]"}, 46 | {"city" ,"VARCHAR" ,32 ,nil ,"Fields[city]"}, 47 | } 48 | 49 | process_message, timer_event = ds.load_schema(name, schema) 50 | -------------------------------------------------------------------------------- /hindsight/output/executive_summary_full.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Outputs a executive summary based on the main and crash pings as a derived stream 7 | in the specified format one table/file per day. 
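Unlike executive_summary.lua, which reads already-extracted values from
messages matching Logger == 'fx' && Type == 'executive_summary', this variant
derives the same columns directly from raw 'main' and 'crash' telemetry pings
using the fx and fx.ping modules (see the schema functions below).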
8 | 9 | Config: 10 | 11 | filename = "executive_summary_full.lua" 12 | message_matcher = "Type == 'telemetry' && (Fields[docType] == 'main' || Fields[docType] == 'crash')" 13 | 14 | format = "redshift.psv" 15 | buffer_path = "/mnt/output" 16 | buffer_size = 20 * 1024 * 1024 17 | s3_path = "s3://test" 18 | 19 | --]] 20 | 21 | local ds = require "derived_stream" 22 | local fx = require "fx" 23 | local ping = require "fx.ping" 24 | require "string" 25 | 26 | local doc_type 27 | local search_counts 28 | 29 | local function get_activity_timestamp() 30 | local ts 31 | if doc_type == "main" then 32 | ts = ping.get_timestamp(ping.info().subsessionStartDate) 33 | else 34 | ping.get_timestamp(ping.payload().payload.crashDate) 35 | end 36 | 37 | if not ts then 38 | ts = read_message("Fields[creationTimestamp]") 39 | end 40 | return ts 41 | end 42 | 43 | local function get_search_counts() 44 | local cnts = {0, 0, 0, 0} 45 | local sc = ping.khist().SEARCH_COUNTS 46 | if type(sc) ~= "table" then return cnts end 47 | 48 | for k, v in pairs(sc) do 49 | if type(v) == "table" then 50 | for i, e in ipairs({"[Gg]oogle", "[Bb]ing", "[Yy]ahoo", "."}) do 51 | if string.match(k, e) then 52 | if type(v.sum) == "number" then 53 | cnts[i] = cnts[i] + v.sum 54 | end 55 | break 56 | end 57 | end 58 | end 59 | end 60 | return cnts 61 | end 62 | 63 | local name = "executive_summary" 64 | local schema = { 65 | -- column name type length attributes field /function 66 | {"Timestamp" ,"TIMESTAMP" ,nil ,"SORTKEY" ,"Timestamp"}, 67 | {"activityTimestamp" ,"TIMESTAMP" ,nil ,nil ,get_activity_timestamp}, 68 | {"profileCreationTimestamp" ,"TIMESTAMP" ,nil ,nil ,ping.profile_creation_timestamp}, 69 | {"buildId" ,"CHAR" ,14 ,nil ,"Fields[appBuildId]"}, 70 | {"clientId" ,"CHAR" ,36 ,"DISTKEY" ,"Fields[clientId]"}, 71 | {"documentId" ,"CHAR" ,36 ,nil ,"Fields[documentId]"}, 72 | {"docType" ,"CHAR" ,36 ,nil ,function () return doc_type end}, 73 | {"country" ,"VARCHAR" ,5 ,nil ,function () return fx.normalize_country(read_message("Fields[geoCountry]")) end}, 74 | {"channel" ,"VARCHAR" ,7 ,nil ,function () return fx.normalize_channel(read_message("Fields[appUpdateChannel]")) end}, 75 | {"os" ,"VARCHAR" ,7 ,nil ,function () return fx.normalize_os(read_message("Fields[os]")) end}, 76 | {"osVersion" ,"VARCHAR" ,32 ,nil ,function () return ping.system().os.version end}, 77 | {"app" ,"VARCHAR" ,32 ,nil ,"Fields[appName]"}, 78 | {"version" ,"VARCHAR" ,32 ,nil ,"Fields[appVersion]"}, 79 | {"vendor" ,"VARCHAR" ,32 ,nil ,"Fields[appVendor]"}, 80 | {"reason" ,"VARCHAR" ,32 ,nil ,"Fields[reason]"}, 81 | {'"default"' ,"BOOLEAN" ,nil ,nil ,ping.is_default_browser}, 82 | {"hours" ,"DOUBLE PRECISION" ,nil ,nil ,ping.hours}, 83 | {"google" ,"INTEGER" ,nil ,nil ,function () return search_counts[1] end}, 84 | {"bing" ,"INTEGER" ,nil ,nil ,function () return search_counts[2] end}, 85 | {"yahoo" ,"INTEGER" ,nil ,nil ,function () return search_counts[3] end}, 86 | {"other" ,"INTEGER" ,nil ,nil ,function () return search_counts[4] end}, 87 | {"city" ,"VARCHAR" ,32 ,nil ,"Fields[geoCity]"}, 88 | } 89 | 90 | local ds_pm 91 | ds_pm, timer_event = ds.load_schema(name, schema) 92 | 93 | function process_message() 94 | ping.clear_cache() 95 | doc_type = read_message("Fields[docType]") 96 | search_counts = get_search_counts() 97 | return ds_pm() 98 | end 99 | 100 | -------------------------------------------------------------------------------- /hindsight/output/main_summary.lua: -------------------------------------------------------------------------------- 1 | 
-- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Outputs a main ping summary derived stream in the specified format one table/file per day. 7 | 8 | Config: 9 | 10 | filename = "main_summary.lua" 11 | message_matcher = "Type == 'telemetry' && Fields[docType] == 'main'" 12 | 13 | format = "redshift.psv" 14 | buffer_path = "/mnt/output" 15 | buffer_size = 100 * 1024 * 1024 16 | s3_path = "s3://test" 17 | 18 | --]] 19 | 20 | local ds = require "derived_stream" 21 | local fx = require "fx" 22 | local ping = require "fx.ping" 23 | 24 | local name = "main_summary" 25 | local schema = { 26 | -- column name type length attributes field /function 27 | {"Timestamp" ,"TIMESTAMP" ,nil ,"SORTKEY" ,"Timestamp"}, 28 | {"subsessionDate" ,"DATE" ,nil ,nil ,function () return ping.get_date(ping.info().subsessionStartDate) end}, 29 | {"clientId" ,"CHAR" ,36 ,"DISTKEY" ,"Fields[clientId]"}, 30 | {"buildVersion" ,"VARCHAR" ,32 ,nil ,function () return ping.build().version end}, 31 | {"buildId" ,"CHAR" ,14 ,nil ,function () return ping.build().buildId end}, 32 | {"buildArchitecture" ,"VARCHAR" ,32 ,nil ,function () return ping.build().architecture end}, 33 | {"channel" ,"VARCHAR" ,7 ,nil ,function () return fx.normalize_channel(read_message("Fields[appUpdateChannel]")) end}, 34 | {"os" ,"VARCHAR" ,7 ,nil ,function () return fx.normalize_os(read_message("Fields[os]")) end}, 35 | {"osVersion" ,"VARCHAR" ,32 ,nil ,function () return ping.system().os.version end}, 36 | {"osServicepackMajor" ,"VARCHAR" ,32 ,nil ,function () return ping.system().os.servicePackMajor end}, 37 | {"osServicepackMinor" ,"VARCHAR" ,32 ,nil ,function () return ping.system().os.servicePackMinor end}, 38 | {"locale" ,"VARCHAR" ,32 ,nil ,function () return ping.settings().locale end}, 39 | {"activeExperimentId" ,"VARCHAR" ,32 ,nil ,function () return ping.addons().activeExperiment.id end}, 40 | {"activeExperimentBranch" ,"VARCHAR" ,32 ,nil ,function () return ping.addons().activeExperiment.branch end}, 41 | {"country" ,"VARCHAR" ,5 ,nil ,function () return fx.normalize_country(read_message("Fields[geoCountry]")) end}, 42 | {"reason" ,"VARCHAR" ,32 ,nil ,function () return ping.info().reason end}, 43 | {"subsessionLength" ,"INTEGER" ,nil ,nil ,function () return ping.info().subsessionLength end}, 44 | {"timezoneOffset" ,"INTEGER" ,nil ,nil ,function () return ping.info().timezoneOffset end}, 45 | {"pluginHangs" ,"INTEGER" ,nil ,nil ,function () return ping.khist_sum("SUBPROCESS_CRASHES_WITH_DUMP", "pluginhang") end}, 46 | {"abortsPlugin" ,"INTEGER" ,nil ,nil ,function () return ping.khist_sum("SUBPROCESS_ABNORMAL_ABORT", "plugin") end}, 47 | {"abortsContent" ,"INTEGER" ,nil ,nil ,function () return ping.khist_sum("SUBPROCESS_ABNORMAL_ABORT", "content") end}, 48 | {"abortsGmplugin" ,"INTEGER" ,nil ,nil ,function () return ping.khist_sum("SUBPROCESS_ABNORMAL_ABORT", "gmplugin") end}, 49 | {"crashesdetectedPlugin" ,"INTEGER" ,nil ,nil ,function () return ping.khist_sum("SUBPROCESS_CRASHES_WITH_DUMP", "plugin") end}, 50 | {"crashesdetectedContent" ,"INTEGER" ,nil ,nil ,function () return ping.khist_sum("SUBPROCESS_CRASHES_WITH_DUMP", "content") end}, 51 | {"crashesdetectedGmplugin" ,"INTEGER" ,nil ,nil ,function () return ping.khist_sum("SUBPROCESS_CRASHES_WITH_DUMP", "gmplugin") end}, 52 | {"crashSubmitAttemptMain" ,"INTEGER" ,nil ,nil ,function () return 
ping.khist_sum("PROCESS_CRASH_SUBMIT_ATTEMPT", "main-crash") end}, 53 | {"crashSubmitAttemptContent" ,"INTEGER" ,nil ,nil ,function () return ping.khist_sum("PROCESS_CRASH_SUBMIT_ATTEMPT", "content-crash") end}, 54 | {"crashSubmitAttemptPlugin" ,"INTEGER" ,nil ,nil ,function () return ping.khist_sum("PROCESS_CRASH_SUBMIT_ATTEMPT", "plugin-crash") end}, 55 | {"crashSubmitSuccessMain" ,"INTEGER" ,nil ,nil ,function () return ping.khist_sum("PROCESS_CRASH_SUBMIT_SUCCESS", "main-crash") end}, 56 | {"crashSubmitSuccessContent" ,"INTEGER" ,nil ,nil ,function () return ping.khist_sum("PROCESS_CRASH_SUBMIT_SUCCESS", "content-crash") end}, 57 | {"crashSubmitSuccessPlugin" ,"INTEGER" ,nil ,nil ,function () return ping.khist_sum("PROCESS_CRASH_SUBMIT_SUCCESS", "plugin-crash") end}, 58 | {"activeAddons" ,"INTEGER" ,nil ,nil ,function () return ping.num_active_addons() end}, 59 | {"flashVersion" ,"VARCHAR" ,16 ,nil ,function () return ping.flash_version() end}, 60 | } 61 | 62 | local ds_pm 63 | ds_pm, timer_event = ds.load_schema(name, schema) 64 | 65 | function process_message() 66 | ping.clear_cache() 67 | return ds_pm() 68 | end 69 | 70 | -------------------------------------------------------------------------------- /reports/budget/budget.toml: -------------------------------------------------------------------------------- 1 | [hekad] 2 | maxprocs = 8 3 | base_dir = "/mnt/telemetry/output" 4 | share_dir = "/mnt/telemetry/heka/share/heka" 5 | # 8MB 6 | max_message_size = 8388608 7 | 8 | [SnappyDecoder] 9 | 10 | [Multi] 11 | type = "MultiDecoder" 12 | subs = ["SnappyDecoder", "ProtobufDecoder"] 13 | cascade_strategy = "all" 14 | log_sub_errors = true 15 | 16 | [S3Input] 17 | type = "S3SplitFileInput" 18 | s3_bucket = "net-mozaws-prod-us-west-2-pipeline-data" 19 | s3_bucket_prefix = "telemetry-payload-size" 20 | s3_worker_count = 16 21 | s3_read_timeout = 600 22 | schema_file = "schema.json" 23 | decoder = "Multi" 24 | 25 | [PipelineBudget] 26 | type = "SandboxFilter" 27 | filename = "lua_filters/dollars.lua" 28 | message_matcher = "Logger == 'PayloadSize' && Type == 'heka.sandbox.payload_size'" 29 | output_limit = 0 30 | instruction_limit = 0 31 | memory_limit = 0 32 | ticker_interval = 60 33 | timer_event_on_shutdown = true 34 | preserve_data = true 35 | 36 | [PipelineBudget.config] 37 | max_per_channel = 90 38 | 39 | [DashboardOutput] 40 | address = ":8080" 41 | static_directory = "/mnt/telemetry/heka/share/heka/dasher" 42 | ticker_interval = 10 43 | -------------------------------------------------------------------------------- /reports/budget/check_targets.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | # This Source Code Form is subject to the terms of the Mozilla Public 5 | # License, v. 2.0. If a copy of the MPL was not distributed with this 6 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. 7 | 8 | # Check specified submission day, alert if the data volume exceeds target. 9 | # If we exceed any targets for the day, send an alert email. 
10 | # Targets may be found at 11 | # s3://net-mozaws-prod-us-west-2-pipeline-metadata/telemetry-2/budget_targets.json 12 | 13 | import sys 14 | import json 15 | import argparse 16 | from boto.ses import connect_to_region as ses_connect 17 | 18 | def pct(actual, expected): 19 | return "{:.1%}".format(float(actual) / expected) 20 | 21 | def gb(num_bytes): 22 | return "{:.2f}GB".format(float(num_bytes) / 1024.0 / 1024.0 / 1024.0) 23 | 24 | def fmt_err(channel, docType, actual, expected): 25 | sign = ">" 26 | if actual < expected: 27 | sign = "<" 28 | return "Channel {}, Type {}: Actual {} {} Expected {} ({})".format( 29 | channel, docType, gb(actual), sign, gb(expected), pct(actual, expected)) 30 | 31 | def main(): 32 | parser = argparse.ArgumentParser(description="Check Budget Targets") 33 | parser.add_argument("--day", help="Day to check (YYYYMMDD)", required=True) 34 | parser.add_argument("--targets-file", help="JSON file containing budget targets", type=file, required=True) 35 | parser.add_argument("--data-file", help="JSON file containing observed data", type=file, required=True) 36 | parser.add_argument("--from-email", help="Email 'from:' address", required=True) 37 | parser.add_argument("--to-email", help="Email 'to:' address (multiple allowed)", action="append", required=True) 38 | parser.add_argument("--dry-run", help="Print out what would happen instead of sending alert email", action="store_true") 39 | parser.add_argument("--verbose", help="Print all the messages", action="store_true") 40 | args = parser.parse_args() 41 | 42 | target_day = args.day 43 | try: 44 | targets = json.load(args.targets_file) 45 | except Exception as e: 46 | print "Error parsing JSON from {}: {}".format(args.targets_file.name, e) 47 | return 2 48 | 49 | try: 50 | data = json.load(args.data_file) 51 | except Exception as e: 52 | print "Error parsing JSON from {}: {}".format(args.data_file.name, e) 53 | return 2 54 | 55 | errors = [] 56 | exit_code = 0 57 | try: 58 | s = data["submission"] 59 | for c in targets.keys(): 60 | if c not in s: 61 | if args.verbose: 62 | print "warning: {} not found in data.".format(c) 63 | continue 64 | if target_day not in s[c]: 65 | if args.verbose: 66 | print "warning: {}/{} not found in data.".format(c, target_day) 67 | continue 68 | 69 | scd = s[c][target_day] 70 | clients = targets[c]["clients"] 71 | for docType in targets[c].keys(): 72 | if docType == "clients": 73 | continue 74 | else: 75 | if docType not in scd: 76 | if args.verbose: 77 | print "warning: {}/{}/{} not found in data.".format(c, target_day, docType) 78 | continue 79 | scdt = scd[docType] 80 | expected_size = targets[c][docType]["size"] * targets[c][docType]["count"] * clients 81 | actual_size = scdt["size"] 82 | if actual_size > expected_size: 83 | errors.append(fmt_err(c, docType, actual_size, expected_size)) 84 | else: 85 | if args.verbose: 86 | print "ok: {}".format(fmt_err(c, docType, actual_size, expected_size)) 87 | except Exception as e: 88 | print "Data error: {}".format(e) 89 | exit_code = 3 90 | 91 | if len(errors) > 0: 92 | message = "Incoming data for {} exceeded budget targets:\n".format(args.day) + "\n".join(sorted(errors)) 93 | subject = "Incoming Telemetry data exceeded budget targets for {}".format(args.day) 94 | if args.dry_run: 95 | print "Dry-run mode. 
Would have sent:" 96 | print "==============================" 97 | print " From:", args.from_email 98 | print " To:", args.to_email 99 | print "Subject:", subject 100 | print " Body:", message 101 | else: 102 | # ses = ses_connect('us-east-1') 103 | ses = ses_connect('us-west-2') 104 | ses.send_email( 105 | source = args.from_email, 106 | subject = subject, 107 | format = "text", 108 | body = message, 109 | to_addresses = args.to_email 110 | ) 111 | elif args.dry_run: 112 | print "Dry-run mode, but would not have sent any alerts." 113 | 114 | return exit_code 115 | 116 | if __name__ == "__main__": 117 | sys.exit(main()) 118 | -------------------------------------------------------------------------------- /reports/budget/package.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | VERSION=0.3 4 | 5 | tar czvf budget-report-${VERSION}.tar.gz budget.toml run.sh schema_template.json check_targets.py 6 | -------------------------------------------------------------------------------- /reports/budget/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | OUTPUT=output 3 | if [ ! -d "$OUTPUT" ]; then 4 | mkdir -p "$OUTPUT/sandbox_preservation" 5 | fi 6 | 7 | # If we have an argument, process that day. 8 | TARGET=$1 9 | if [ -z "$TARGET" ]; then 10 | # Default to processing "yesterday" 11 | TARGET=$(date -d 'yesterday' +%Y%m%d) 12 | fi 13 | 14 | # Install heka 15 | wget http://people.mozilla.org/~mreid/heka-20150918-0_11_0-linux-amd64.tar.gz -O heka.tar.gz 16 | tar xzf heka.tar.gz 17 | mv heka-* heka 18 | 19 | echo "Fetching previous state..." 20 | 21 | aws s3 sync s3://telemetry-private-analysis-2/budget-report/data/sandbox_preservation/ "$OUTPUT/sandbox_preservation/" 22 | 23 | sed -r "s/__TARGET__/$TARGET/" schema_template.json > schema.json 24 | heka/bin/hekad -config budget.toml 25 | 26 | # Push json to prod report bucket/path 27 | DATA="$OUTPUT/dashboard/data/PipelineBudget.SubmissionSizesbychannelanddate.json" 28 | aws s3 cp "$DATA" s3://net-mozaws-prod-metrics-data/telemetry-budget-dashboard/budget.json --acl bucket-owner-full-control 29 | 30 | echo "Fetching budget targets" 31 | aws s3 cp s3://net-mozaws-prod-us-west-2-pipeline-metadata/telemetry-2/budget_targets.json ./ 32 | 33 | # Alert if data for $TARGET exceeds expected volume. 
34 | ALERT_FROM=telemetry-alerts@mozilla.com 35 | ALERT_TO=$ALERT_FROM 36 | echo "Checking if we've exceeded targets" 37 | python check_targets.py --day $TARGET \ 38 | --targets-file budget_targets.json \ 39 | --data-file "$DATA" \ 40 | --from-email $ALERT_FROM \ 41 | --to-email $ALERT_TO \ 42 | --verbose 43 | -------------------------------------------------------------------------------- /reports/budget/schema_template.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 2, 3 | "dimensions": [ 4 | { "field_name": "submissionDate", "allowed_values": "__TARGET__" }, 5 | { "field_name": "docType", "allowed_values": "*" }, 6 | { "field_name": "channel", "allowed_values": "*" } 7 | ] 8 | } 9 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/hindsight.cfg: -------------------------------------------------------------------------------- 1 | output_path = "output" 2 | output_size = 1024 * 1024 * 1024 3 | sandbox_load_path = "" 4 | sandbox_run_path = "run" 5 | analysis_threads = 1 6 | analysis_lua_path = "/usr/lib/luasandbox/modules/?.lua;/mnt/telemetry/heka/share/heka/lua_modules/?.lua" 7 | analysis_lua_cpath = "/usr/lib/luasandbox/modules/?.so;/mnt/telemetry/heka/share/heka/lua_modules/?.so" 8 | io_lua_path = analysis_lua_path .. ";/usr/lib/luasandbox/io_modules/?.lua;/mnt/telemetry/heka/share/heka/lua_io_modules/?.lua" 9 | io_lua_cpath = analysis_lua_cpath .. ";/usr/lib/luasandbox/io_modules/?.so;/mnt/telemetry/heka/share/heka/lua_io_modules/?.so" 10 | max_message_size = 1024 * 1024 * 1024 11 | backpressure = 2 12 | 13 | analysis_defaults = { 14 | output_limit = 0, 15 | memory_limit = 0, 16 | instruction_limit = 0, 17 | ticker_interval = 0, 18 | preserve_data = false, 19 | } 20 | 21 | input_defaults = { 22 | output_limit = 1024 * 1024 * 8, 23 | instruction_limit = 0, 24 | preserve_data = false, 25 | } 26 | 27 | output_defaults = { 28 | output_limit = 1024 * 1024 * 8, 29 | ticker_interval = 0, 30 | instruction_limit = 0, 31 | memory_limit = 0, 32 | preserve_data = false, 33 | } 34 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/input/popen.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | require "io" 6 | require "heka_stream_reader" 7 | require "snappy" 8 | 9 | local hsr = heka_stream_reader.new("stdin") 10 | local dhsr = heka_stream_reader.new("snappy") 11 | 12 | local function snappy_decode(msgbytes) 13 | local ok, uc = pcall(snappy.uncompress, msgbytes) 14 | if ok then 15 | return uc 16 | end 17 | return msgbytes 18 | end 19 | 20 | function process_message() 21 | local fh = assert(io.popen("cat " .. read_config("list") .. 
" | ../heka/bin/s3cat -bucket='net-mozaws-prod-us-west-2-pipeline-data' -stdin=true")) 22 | local found, consumed, read 23 | repeat 24 | repeat 25 | found, consumed, read = hsr:find_message(fh, false) -- don't protobuf decode 26 | if found then 27 | local pbm = snappy_decode(hsr:read_message("raw")) 28 | local ok = pcall(dhsr.decode_message, dhsr, pbm) 29 | if ok then 30 | inject_message(dhsr) 31 | end 32 | end 33 | until not found 34 | until read == 0 35 | return 0 36 | end 37 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/input/popen01.cfg: -------------------------------------------------------------------------------- 1 | filename = "popen.lua" 2 | memory_limit = 2000000000 3 | 4 | list = "xaa" 5 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/input/popen02.cfg: -------------------------------------------------------------------------------- 1 | filename = "popen.lua" 2 | memory_limit = 2000000000 3 | 4 | list = "xab" 5 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/input/popen03.cfg: -------------------------------------------------------------------------------- 1 | filename = "popen.lua" 2 | memory_limit = 2000000000 3 | 4 | list = "xac" 5 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/input/popen04.cfg: -------------------------------------------------------------------------------- 1 | filename = "popen.lua" 2 | memory_limit = 2000000000 3 | 4 | list = "xad" 5 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/input/popen05.cfg: -------------------------------------------------------------------------------- 1 | filename = "popen.lua" 2 | memory_limit = 2000000000 3 | 4 | list = "xae" 5 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/input/popen06.cfg: -------------------------------------------------------------------------------- 1 | filename = "popen.lua" 2 | memory_limit = 2000000000 3 | 4 | list = "xaf" 5 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/input/popen07.cfg: -------------------------------------------------------------------------------- 1 | filename = "popen.lua" 2 | memory_limit = 2000000000 3 | 4 | list = "xag" 5 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/input/popen08.cfg: -------------------------------------------------------------------------------- 1 | filename = "popen.lua" 2 | memory_limit = 2000000000 3 | 4 | list = "xah" 5 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/input/popen09.cfg: -------------------------------------------------------------------------------- 1 | filename = "popen.lua" 2 | memory_limit = 2000000000 3 | 4 | list = "xai" 5 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/input/popen10.cfg: -------------------------------------------------------------------------------- 1 | filename = "popen.lua" 2 | memory_limit = 2000000000 3 | 4 | list = 
"xaj" 5 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/input/popen11.cfg: -------------------------------------------------------------------------------- 1 | filename = "popen.lua" 2 | memory_limit = 2000000000 3 | 4 | list = "xak" 5 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/input/popen12.cfg: -------------------------------------------------------------------------------- 1 | filename = "popen.lua" 2 | memory_limit = 2000000000 3 | 4 | list = "xal" 5 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/input/popen13.cfg: -------------------------------------------------------------------------------- 1 | filename = "popen.lua" 2 | memory_limit = 2000000000 3 | 4 | list = "xam" 5 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/input/popen14.cfg: -------------------------------------------------------------------------------- 1 | filename = "popen.lua" 2 | memory_limit = 2000000000 3 | 4 | list = "xan" 5 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/input/popen15.cfg: -------------------------------------------------------------------------------- 1 | filename = "popen.lua" 2 | memory_limit = 2000000000 3 | 4 | list = "xao" 5 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/input/popen16.cfg: -------------------------------------------------------------------------------- 1 | filename = "popen.lua" 2 | memory_limit = 2000000000 3 | 4 | list = "xap" 5 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/input/prune_input.cfg: -------------------------------------------------------------------------------- 1 | filename = "prune_input.lua" 2 | ticker_interval = 60 3 | 4 | output_path = "output" 5 | exit_on_stall = true 6 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/input/prune_input.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Hindsight input log file pruner 7 | 8 | Deletes the log files generated by the input plugins, when all the analysis and 9 | output plugins are done consumining them (within ticker_interval seconds). 10 | 11 | *Example Hindsight Configuration* 12 | 13 | .. code-block:: lua 14 | filename = "prune_input.lua" 15 | ticker_interval = 60 16 | 17 | output_path = "output" -- Path to the hindsight.cp file. 18 | exit_on_stall = false -- When true, causes the plugin to stop/abort when the checkpoints are no longer advancing. 19 | -- Use this option to allow hindsight_cli to exit when the inputs are finished. This plugin/option 20 | -- is typically used when streaming a large data set from something like s3 i.e., running 21 | -- a report. 
22 | --]] 23 | 24 | require "io" 25 | require "os" 26 | require "string" 27 | require "math" 28 | local l = require "lpeg" 29 | l.locale(l) 30 | 31 | local output_path = read_config("output_path") or error("output_path must be set") 32 | local exit_on_stall = read_config("exit_on_stall") 33 | 34 | local function get_min(t, i, o) 35 | if not t.min then t.min = math.huge end 36 | if i < t.min then 37 | t.min = i 38 | t.off = o 39 | end 40 | return t 41 | end 42 | 43 | local pair = l.P"'" * l.Cg(l.digit^1/tonumber * ":" * l.C(l.digit^1)) * "'" 44 | local ignore = (l.P(1) - "\n")^0 * "\n" 45 | local line = l.P"_G['input->" * (l.P(1) - "'")^1 * "']" * l.space^0 * "=" * l.space^0 * pair * l.space^0 + ignore 46 | local grammar = l.Cf(l.Ct("") * line^1, get_min) 47 | local min, off = -1, -1 48 | 49 | function process_message() 50 | local fh = io.open(output_path .. "/hindsight.cp") 51 | if not fh then return 0 end -- checkpoint file not available yet 52 | 53 | local s = fh:read("*a") 54 | fh:close() 55 | if s then 56 | local t = grammar:match(s) 57 | if t then 58 | if min == t.min and off == t.off then 59 | if exit_on_stall then 60 | error("input has stopped") 61 | end 62 | else 63 | off = t.off 64 | if min ~= t.min then 65 | min = t.min 66 | for i = min - 1, 0, -1 do 67 | local r = os.remove(string.format("%s/input/%d.log", output_path, i)) 68 | if not r then break end 69 | end 70 | end 71 | end 72 | end 73 | end 74 | return 0 75 | end 76 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/crash_summary.lua: -------------------------------------------------------------------------------- 1 | ../../../heka/share/heka/lua_outputs/crash_summary.lua -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/crash_summary01.cfg: -------------------------------------------------------------------------------- 1 | filename = "crash_summary.lua" 2 | message_matcher = "Uuid < '\016' && Type == 'telemetry' && Fields[docType] == 'crash'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/crash_summary02.cfg: -------------------------------------------------------------------------------- 1 | filename = "crash_summary.lua" 2 | message_matcher = "Uuid > '\016' && Uuid < '\032' && Type == 'telemetry' && Fields[docType] == 'crash'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/crash_summary03.cfg: -------------------------------------------------------------------------------- 1 | filename = "crash_summary.lua" 2 | message_matcher = "Uuid > '\032' && Uuid < '\048' && Type == 'telemetry' && Fields[docType] == 'crash'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/crash_summary04.cfg: 
-------------------------------------------------------------------------------- 1 | filename = "crash_summary.lua" 2 | message_matcher = "Uuid > '\048' && Uuid < '\064' && Type == 'telemetry' && Fields[docType] == 'crash'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/crash_summary05.cfg: -------------------------------------------------------------------------------- 1 | filename = "crash_summary.lua" 2 | message_matcher = "Uuid > '\064' && Uuid < '\080' && Type == 'telemetry' && Fields[docType] == 'crash'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/crash_summary06.cfg: -------------------------------------------------------------------------------- 1 | filename = "crash_summary.lua" 2 | message_matcher = "Uuid > '\080' && Uuid < '\096' && Type == 'telemetry' && Fields[docType] == 'crash'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/crash_summary07.cfg: -------------------------------------------------------------------------------- 1 | filename = "crash_summary.lua" 2 | message_matcher = "Uuid > '\096' && Uuid < '\112' && Type == 'telemetry' && Fields[docType] == 'crash'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/crash_summary08.cfg: -------------------------------------------------------------------------------- 1 | filename = "crash_summary.lua" 2 | message_matcher = "Uuid > '\112' && Uuid < '\128' && Type == 'telemetry' && Fields[docType] == 'crash'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/crash_summary09.cfg: -------------------------------------------------------------------------------- 1 | filename = "crash_summary.lua" 2 | message_matcher = "Uuid > '\128' && Uuid < '\144' && Type == 'telemetry' && Fields[docType] == 'crash'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/crash_summary10.cfg: -------------------------------------------------------------------------------- 1 | filename = "crash_summary.lua" 2 | message_matcher = "Uuid > '\144' && Uuid < '\160' && Type == 'telemetry' && Fields[docType] == 'crash'" 3 | 4 | 
format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/crash_summary11.cfg: -------------------------------------------------------------------------------- 1 | filename = "crash_summary.lua" 2 | message_matcher = "Uuid > '\160' && Uuid < '\176' && Type == 'telemetry' && Fields[docType] == 'crash'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/crash_summary12.cfg: -------------------------------------------------------------------------------- 1 | filename = "crash_summary.lua" 2 | message_matcher = "Uuid > '\176' && Uuid < '\192' && Type == 'telemetry' && Fields[docType] == 'crash'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/crash_summary13.cfg: -------------------------------------------------------------------------------- 1 | filename = "crash_summary.lua" 2 | message_matcher = "Uuid > '\192' && Uuid < '\208' && Type == 'telemetry' && Fields[docType] == 'crash'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/crash_summary14.cfg: -------------------------------------------------------------------------------- 1 | filename = "crash_summary.lua" 2 | message_matcher = "Uuid > '\208' && Uuid < '\224' && Type == 'telemetry' && Fields[docType] == 'crash'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/crash_summary15.cfg: -------------------------------------------------------------------------------- 1 | filename = "crash_summary.lua" 2 | message_matcher = "Uuid > '\224' && Uuid < '\240' && Type == 'telemetry' && Fields[docType] == 'crash'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/crash_summary16.cfg: -------------------------------------------------------------------------------- 1 | filename = "crash_summary.lua" 2 | message_matcher = "Uuid > '\240' && Type == 'telemetry' && Fields[docType] == 'crash'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- 
/reports/derived_streams/derived_streams/run/output/executive_summary01.cfg: -------------------------------------------------------------------------------- 1 | filename = "executive_summary_full.lua" 2 | message_matcher = "Uuid < '\016' && Type == 'telemetry' && (Fields[docType] == 'main' || Fields[docType] == 'crash')" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/executive_summary02.cfg: -------------------------------------------------------------------------------- 1 | filename = "executive_summary_full.lua" 2 | message_matcher = "Uuid > '\016' && Uuid < '\032' && Type == 'telemetry' && (Fields[docType] == 'main' || Fields[docType] == 'crash')" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/executive_summary03.cfg: -------------------------------------------------------------------------------- 1 | filename = "executive_summary_full.lua" 2 | message_matcher = "Uuid > '\032' && Uuid < '\048' && Type == 'telemetry' && (Fields[docType] == 'main' || Fields[docType] == 'crash')" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/executive_summary04.cfg: -------------------------------------------------------------------------------- 1 | filename = "executive_summary_full.lua" 2 | message_matcher = "Uuid > '\048' && Uuid < '\064' && Type == 'telemetry' && (Fields[docType] == 'main' || Fields[docType] == 'crash')" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/executive_summary05.cfg: -------------------------------------------------------------------------------- 1 | filename = "executive_summary_full.lua" 2 | message_matcher = "Uuid > '\064' && Uuid < '\080' && Type == 'telemetry' && (Fields[docType] == 'main' || Fields[docType] == 'crash')" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/executive_summary06.cfg: -------------------------------------------------------------------------------- 1 | filename = "executive_summary_full.lua" 2 | message_matcher = "Uuid > '\080' && Uuid < '\096' && Type == 'telemetry' && (Fields[docType] == 'main' || Fields[docType] == 'crash')" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | 
-------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/executive_summary07.cfg: -------------------------------------------------------------------------------- 1 | filename = "executive_summary_full.lua" 2 | message_matcher = "Uuid > '\096' && Uuid < '\112' && Type == 'telemetry' && (Fields[docType] == 'main' || Fields[docType] == 'crash')" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/executive_summary08.cfg: -------------------------------------------------------------------------------- 1 | filename = "executive_summary_full.lua" 2 | message_matcher = "Uuid > '\112' && Uuid < '\128' && Type == 'telemetry' && (Fields[docType] == 'main' || Fields[docType] == 'crash')" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/executive_summary09.cfg: -------------------------------------------------------------------------------- 1 | filename = "executive_summary_full.lua" 2 | message_matcher = "Uuid > '\128' && Uuid < '\144' && Type == 'telemetry' && (Fields[docType] == 'main' || Fields[docType] == 'crash')" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/executive_summary10.cfg: -------------------------------------------------------------------------------- 1 | filename = "executive_summary_full.lua" 2 | message_matcher = "Uuid > '\144' && Uuid < '\160' && Type == 'telemetry' && (Fields[docType] == 'main' || Fields[docType] == 'crash')" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/executive_summary11.cfg: -------------------------------------------------------------------------------- 1 | filename = "executive_summary_full.lua" 2 | message_matcher = "Uuid > '\160' && Uuid < '\176' && Type == 'telemetry' && (Fields[docType] == 'main' || Fields[docType] == 'crash')" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/executive_summary12.cfg: -------------------------------------------------------------------------------- 1 | filename = "executive_summary_full.lua" 2 | message_matcher = "Uuid > '\176' && Uuid < '\192' && Type == 'telemetry' && (Fields[docType] == 'main' || Fields[docType] == 'crash')" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = 
"s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/executive_summary13.cfg: -------------------------------------------------------------------------------- 1 | filename = "executive_summary_full.lua" 2 | message_matcher = "Uuid > '\192' && Uuid < '\208' && Type == 'telemetry' && (Fields[docType] == 'main' || Fields[docType] == 'crash')" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/executive_summary14.cfg: -------------------------------------------------------------------------------- 1 | filename = "executive_summary_full.lua" 2 | message_matcher = "Uuid > '\208' && Uuid < '\224' && Type == 'telemetry' && (Fields[docType] == 'main' || Fields[docType] == 'crash')" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/executive_summary15.cfg: -------------------------------------------------------------------------------- 1 | filename = "executive_summary_full.lua" 2 | message_matcher = "Uuid > '\224' && Uuid < '\240' && Type == 'telemetry' && (Fields[docType] == 'main' || Fields[docType] == 'crash')" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/executive_summary16.cfg: -------------------------------------------------------------------------------- 1 | filename = "executive_summary_full.lua" 2 | message_matcher = "Uuid > '\240' && Type == 'telemetry' && (Fields[docType] == 'main' || Fields[docType] == 'crash')" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/executive_summary_full.lua: -------------------------------------------------------------------------------- 1 | ../../../heka/share/heka/lua_outputs/executive_summary_full.lua -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/main_summary.lua: -------------------------------------------------------------------------------- 1 | ../../../heka/share/heka/lua_outputs/main_summary.lua -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/main_summary01.cfg: -------------------------------------------------------------------------------- 1 | filename = "main_summary.lua" 2 | message_matcher = "Uuid < '\016' && Type == 'telemetry' && Fields[docType] == 'main'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = 
"s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/main_summary02.cfg: -------------------------------------------------------------------------------- 1 | filename = "main_summary.lua" 2 | message_matcher = "Uuid > '\016' && Uuid < '\032' && Type == 'telemetry' && Fields[docType] == 'main'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/main_summary03.cfg: -------------------------------------------------------------------------------- 1 | filename = "main_summary.lua" 2 | message_matcher = "Uuid > '\032' && Uuid < '\048' && Type == 'telemetry' && Fields[docType] == 'main'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/main_summary04.cfg: -------------------------------------------------------------------------------- 1 | filename = "main_summary.lua" 2 | message_matcher = "Uuid > '\048' && Uuid < '\064' && Type == 'telemetry' && Fields[docType] == 'main'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/main_summary05.cfg: -------------------------------------------------------------------------------- 1 | filename = "main_summary.lua" 2 | message_matcher = "Uuid > '\064' && Uuid < '\080' && Type == 'telemetry' && Fields[docType] == 'main'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/main_summary06.cfg: -------------------------------------------------------------------------------- 1 | filename = "main_summary.lua" 2 | message_matcher = "Uuid > '\080' && Uuid < '\096' && Type == 'telemetry' && Fields[docType] == 'main'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/main_summary07.cfg: -------------------------------------------------------------------------------- 1 | filename = "main_summary.lua" 2 | message_matcher = "Uuid > '\096' && Uuid < '\112' && Type == 'telemetry' && Fields[docType] == 'main'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/main_summary08.cfg: 
-------------------------------------------------------------------------------- 1 | filename = "main_summary.lua" 2 | message_matcher = "Uuid > '\112' && Uuid < '\128' && Type == 'telemetry' && Fields[docType] == 'main'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/main_summary09.cfg: -------------------------------------------------------------------------------- 1 | filename = "main_summary.lua" 2 | message_matcher = "Uuid > '\128' && Uuid < '\144' && Type == 'telemetry' && Fields[docType] == 'main'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/main_summary10.cfg: -------------------------------------------------------------------------------- 1 | filename = "main_summary.lua" 2 | message_matcher = "Uuid > '\144' && Uuid < '\160' && Type == 'telemetry' && Fields[docType] == 'main'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/main_summary11.cfg: -------------------------------------------------------------------------------- 1 | filename = "main_summary.lua" 2 | message_matcher = "Uuid > '\160' && Uuid < '\176' && Type == 'telemetry' && Fields[docType] == 'main'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/main_summary12.cfg: -------------------------------------------------------------------------------- 1 | filename = "main_summary.lua" 2 | message_matcher = "Uuid > '\176' && Uuid < '\192' && Type == 'telemetry' && Fields[docType] == 'main'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/main_summary13.cfg: -------------------------------------------------------------------------------- 1 | filename = "main_summary.lua" 2 | message_matcher = "Uuid > '\192' && Uuid < '\208' && Type == 'telemetry' && Fields[docType] == 'main'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/main_summary14.cfg: -------------------------------------------------------------------------------- 1 | filename = "main_summary.lua" 2 | message_matcher = "Uuid > '\208' && Uuid < '\224' && Type == 'telemetry' && Fields[docType] == 'main'" 3 | 4 | format = "redshift.psv" 
5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/main_summary15.cfg: -------------------------------------------------------------------------------- 1 | filename = "main_summary.lua" 2 | message_matcher = "Uuid > '\224' && Uuid < '\240' && Type == 'telemetry' && Fields[docType] == 'main'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/run/output/main_summary16.cfg: -------------------------------------------------------------------------------- 1 | filename = "main_summary.lua" 2 | message_matcher = "Uuid > '\240' && Type == 'telemetry' && Fields[docType] == 'main'" 3 | 4 | format = "redshift.psv" 5 | buffer_path = "../s3output" 6 | buffer_size = 100 * 1024 * 1024 7 | s3_path = "s3://telemetry-private-analysis-2/derived_streams/data" 8 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/schema_template.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "dimensions": [ 4 | { "field_name": "submissionDate", "allowed_values": "__TARGET__"}, 5 | { "field_name": "sourceName", "allowed_values": "telemetry" }, 6 | { "field_name": "sourceVersion", "allowed_values": "4" }, 7 | { "field_name": "docType", "allowed_values": ["main", "crash"]}, 8 | { "field_name": "appName", "allowed_values": "*" }, 9 | { "field_name": "appUpdateChannel", "allowed_values": "*" }, 10 | { "field_name": "appVersion", "allowed_values": "*" }, 11 | { "field_name": "appBuildId", "allowed_values": "*"} 12 | ] 13 | } 14 | 15 | -------------------------------------------------------------------------------- /reports/derived_streams/derived_streams/splitter.lua: -------------------------------------------------------------------------------- 1 | require "io" 2 | require "string" 3 | 4 | local PARTITIONS = 16 5 | local fhs = {} 6 | for i=1, PARTITIONS do 7 | fhs[i] = assert(io.open(string.format("xa%c", 96 + i ), "w+")) 8 | end 9 | local cnt = 0 10 | 11 | for line in io.lines("list.txt") do 12 | local idx = cnt % PARTITIONS + 1 13 | fhs[idx]:write(line, "\n") 14 | cnt = cnt + 1 15 | end 16 | 17 | for i=1, PARTITIONS do 18 | fhs[i]:close() 19 | end 20 | -------------------------------------------------------------------------------- /reports/derived_streams/hindsight/bin/hindsight: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla-services/data-pipeline/0c94d328f243338d21bae360547c300ac1b82b12/reports/derived_streams/hindsight/bin/hindsight -------------------------------------------------------------------------------- /reports/derived_streams/hindsight/bin/hindsight_cli: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla-services/data-pipeline/0c94d328f243338d21bae360547c300ac1b82b12/reports/derived_streams/hindsight/bin/hindsight_cli -------------------------------------------------------------------------------- /reports/derived_streams/luasandbox-0.10.2-Linux-core.deb: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla-services/data-pipeline/0c94d328f243338d21bae360547c300ac1b82b12/reports/derived_streams/luasandbox-0.10.2-Linux-core.deb -------------------------------------------------------------------------------- /reports/derived_streams/package.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | VERSION=0.11 4 | 5 | # Git doesn't keep empty dirs :( Ensure that all necessary dirs are present. 6 | mkdir -p derived_streams/run/analysis 7 | mkdir -p derived_streams/run/input 8 | mkdir -p derived_streams/run/output 9 | 10 | tar czvf derived_streams-v4-${VERSION}.tar.gz derived_streams hindsight luasandbox-0.10.2-Linux-core.deb run.sh snappy.so 11 | -------------------------------------------------------------------------------- /reports/derived_streams/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Install dependencies 4 | sudo apt-get --yes install lua5.1 postgresql-client jq 5 | sudo dpkg -i luasandbox-0.10.2-Linux-core.deb 6 | 7 | OUTPUT=output 8 | if [ ! -d "$OUTPUT" ]; then 9 | mkdir -p "$OUTPUT" 10 | fi 11 | 12 | S3OUTPUT=s3output 13 | if [ ! -d "$S3OUTPUT" ]; then 14 | mkdir -p "$S3OUTPUT" 15 | fi 16 | 17 | # Install dependencies 18 | wget http://people.mozilla.org/~mtrinkala/heka-20151124-0_11_0-linux-amd64.tar.gz -O heka.tar.gz 19 | tar xzf heka.tar.gz 20 | 21 | # Rename the dir to make it easier to refer to 22 | mv heka-* heka 23 | cp snappy.so heka/share/heka/lua_modules/ 24 | 25 | cd derived_streams 26 | # If we have an argument, process that day. 27 | TARGET=$1 28 | if [ -z "$TARGET" ]; then 29 | # Default to processing "yesterday" 30 | TARGET=$(date -d 'yesterday' +%Y%m%d) 31 | fi 32 | 33 | # Update schema with target: 34 | sed -r "s/__TARGET__/$TARGET/" schema_template.json > schema.json 35 | 36 | # Fetch metadata 37 | META=net-mozaws-prod-us-west-2-pipeline-metadata 38 | # Get metadata: 39 | aws s3 cp s3://$META/sources.json ./ 40 | 41 | # Get the Telemetry data location 42 | BUCKET=$(jq -r '.["telemetry"].bucket' < sources.json) 43 | PREFIX=$(jq -r '.["telemetry"].prefix' < sources.json) 44 | 45 | # Run code: 46 | ../heka/bin/heka-s3list -schema schema.json -bucket="$BUCKET" -bucket-prefix="$PREFIX" > list.txt 47 | lua splitter.lua 48 | ../hindsight/bin/hindsight_cli hindsight.cfg 7 49 | 50 | RV=$? 51 | 52 | if [ $RV -ne 0 ]; then 53 | echo "Hindsight encountered an error, returned a value of $RV. Not proceeding with DB load." 54 | exit $RV 55 | fi 56 | 57 | echo "Loading data for $TARGET into Redshift..." 58 | 59 | ## TODO: We assume these are all in the same database. Should fetch credentials 60 | # for each table separately. 
61 | META_PREFIX=$(jq -r '.["telemetry-executive-summary-db"]["metadata_prefix"]' < sources.json) 62 | # Get read-write credentials: 63 | aws s3 cp s3://$META/$META_PREFIX/write/credentials.json ./ 64 | 65 | DB_HOST=$(jq -r '.host' < credentials.json) 66 | DB_PORT=$(jq -r '.port' < credentials.json) 67 | DB_NAME=$(jq -r '.db_name' < credentials.json) 68 | DB_USER=$(jq -r '.username' < credentials.json) 69 | DB_PASS=$(jq -r '.password' < credentials.json) 70 | 71 | # Install these credentials for psql to use 72 | # See http://www.postgresql.org/docs/current/static/libpq-pgpass.html 73 | echo "$DB_HOST:$DB_PORT:$DB_NAME:$DB_USER:$DB_PASS" >> ~/.pgpass 74 | chmod 0600 ~/.pgpass 75 | 76 | PQ="psql -U $DB_USER -h $DB_HOST -p $DB_PORT $DB_NAME" 77 | 78 | # Fetch AWS credentials for IAM role 79 | # See http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/iam-roles-for-amazon-ec2.html#instance-metadata-security-credentials 80 | IAM_ROLE_NAME=$(curl http://169.254.169.254/latest/meta-data/iam/security-credentials/) 81 | curl http://169.254.169.254/latest/meta-data/iam/security-credentials/${IAM_ROLE_NAME} > aws_creds.json 82 | AWS_KEY=$(jq -r '.AccessKeyId' < aws_creds.json) 83 | AWS_SECRET_KEY=$(jq -r '.SecretAccessKey' < aws_creds.json) 84 | TOKEN=$(jq -r '.Token' < aws_creds.json) 85 | 86 | # See http://docs.aws.amazon.com/redshift/latest/dg/copy-parameters-credentials.html 87 | CREDS="aws_access_key_id=${AWS_KEY};aws_secret_access_key=${AWS_SECRET_KEY};token=${TOKEN}" 88 | for t in main crash executive; do 89 | NEW_TABLE="${t}_summary_${TARGET}" 90 | echo "Copying data for $NEW_TABLE..." 91 | $PQ -c "CREATE TABLE IF NOT EXISTS $NEW_TABLE (LIKE ${t}_summary including defaults);" 92 | $PQ -c "COPY $NEW_TABLE FROM 's3://telemetry-private-analysis-2/derived_streams/data/${NEW_TABLE}' CREDENTIALS '$CREDS' ACCEPTANYDATE TRUNCATECOLUMNS ESCAPE ACCEPTINVCHARS as ' ';" 93 | $PQ -c "GRANT SELECT ON $NEW_TABLE TO read_only;" 94 | done 95 | -------------------------------------------------------------------------------- /reports/derived_streams/snappy.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla-services/data-pipeline/0c94d328f243338d21bae360547c300ac1b82b12/reports/derived_streams/snappy.so -------------------------------------------------------------------------------- /reports/engagement_ratio/README.txt: -------------------------------------------------------------------------------- 1 | To Deploy 2 | ========= 3 | 4 | 1. Log in to Telemetry Self-Serve Data Analysis 5 | 2. Click 'Schedule a Spark Job' 6 | 3. Edit or create a job with the following parameters: 7 | Job Name: telemetry-engagement-ratio 8 | Notebook or Jar: Upload MauDau.ipynb 9 | Spark Submission Args: N/A 10 | Cluster Size: 10 11 | Output Visibility: Public 12 | Schedule Frequency: Daily 13 | Day of Week: N/A (Sunday) 14 | Day of Month: N/A (1) 15 | Time of Day (UTC): 4am 16 | Job Timeout (minutes): 300 17 | -------------------------------------------------------------------------------- /reports/executive_summary/README.txt: -------------------------------------------------------------------------------- 1 | To Deploy 2 | ========= 3 | 4 | 1. Run 'package.sh' to create executive-report-v4-0.X.tar.gz 5 | 2. Log in to Telemetry Self-Serve Data Analysis 6 | 3. Click 'Schedule a job' 7 | 4. Edit or create a job with the following parameters: 8 | 4a. 
Weekly: 9 | Job Name: executive-report-weekly 10 | Code Tarball: Upload executive-report-v4-0.X.tar.gz 11 | Execution Commandline: ./run.sh weekly 12 | Output Directory: output 13 | Output Visibility: Private 14 | Schedule Frequency: Weekly 15 | Day of Week: Monday 16 | Day of Month: n/a (1) 17 | Time of Day (UTC): 10am 18 | Job Timeout (minutes): 300 19 | 4b. Monthly: 20 | Job Name: executive-report-monthly 21 | Code Tarball: Upload executive-report-v4-0.X.tar.gz 22 | Execution Commandline: ./run.sh monthly 23 | Output Directory: output 24 | Output Visibility: Private 25 | Schedule Frequency: Monthly 26 | Day of Week: n/a (Sunday) 27 | Day of Month: 1 (It will run for the previous month) 28 | Time of Day (UTC): 10am 29 | Job Timeout (minutes): 600 30 | -------------------------------------------------------------------------------- /reports/executive_summary/package.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | VERSION=0.14 4 | tar czvf executive-report-v4-${VERSION}.tar.gz run.sh run_executive_report.py reformat_v4.py 5 | -------------------------------------------------------------------------------- /reports/executive_summary/reformat_v4.py: -------------------------------------------------------------------------------- 1 | ''' 2 | reformat_v4.py 3 | -------------- 4 | 5 | This script automatically formats any of the v4 rollups generated by Trink 6 | into more consumable dashboard-friendly formats. It simplifies some fields 7 | and creates 'all' categories for the facets so the js on the frontend has to 8 | do only a little bit of processing before everything hits crossfilter. 9 | 10 | 11 | ''' 12 | 13 | import csv 14 | import argparse 15 | import datetime 16 | 17 | parser = argparse.ArgumentParser(description='Reformats the v4 data') 18 | parser.add_argument('-f', '--file', type=str, help='input file to be converted') 19 | parser.add_argument('-o', '--output', type=str, help='output file') 20 | args = parser.parse_args() 21 | 22 | INPUT = args.file 23 | OUTPUT = args.output 24 | 25 | 26 | f = open(INPUT, 'r') 27 | r = csv.reader(f) 28 | 29 | headers = r.next() 30 | COUNTRIES = set(['US','CA','BR','MX','FR','ES','IT','PL','TR','RU','DE','IN','ID','CN','JP','GB']) 31 | OSES = {'WINNT': 'Windows', "Darwin": "Mac", "Linux": "Linux", 'Other':'Other'} 32 | CHANNELS = set(['release', 'beta', 'aurora', 'nightly']) 33 | data_keys = ['actives', 'hours','inactives','new_records', 'five_of_seven', 'total_records', 'crashes', 'default','google', 'bing', 'yahoo', 'other']; 34 | out={} 35 | 36 | def num(s): 37 | try: 38 | return int(s) 39 | except ValueError: 40 | return float(s) 41 | total=0 42 | 43 | for line in r: 44 | 45 | line = dict(zip(headers,line)) 46 | if line['date'] < datetime.datetime.now().strftime('%Y-%m-%d'): 47 | # Don't re-aggregate 'all' lines 48 | if line['geo'] == 'all' or line['channel'] == 'all' or line['os'] == 'all': 49 | continue 50 | 51 | if line['geo'] not in COUNTRIES: line['geo']='Other' 52 | if line['channel'] not in CHANNELS: line['channel'] = 'Other' 53 | for geo in ['all', line['geo']]: 54 | if geo not in out: out[geo]={} 55 | for channel in ['all', line['channel']]: 56 | if channel not in out[geo]: out[geo][channel]={} 57 | for os in ['all', line['os']]: 58 | if os not in out[geo][channel]: out[geo][channel][os]={} 59 | dt = line['date'] 60 | if dt not in out[geo][channel][os]: out[geo][channel][os][dt]={} 61 | for d in data_keys: 62 | if d not in out[geo][channel][os][dt]: 
out[geo][channel][os][dt][d]=0 63 | out[geo][channel][os][dt][d]+=num(line[d]) 64 | 65 | w = csv.writer(open(OUTPUT, 'w')) 66 | w.writerow(headers) 67 | 68 | for g in out: 69 | for c in out[g]: 70 | for o in out[g][c]: 71 | for dt in out[g][c][o]: 72 | data_values = [out[g][c][o][dt][_] for _ in data_keys] 73 | w.writerow([g,c,o,dt] + data_values) 74 | -------------------------------------------------------------------------------- /reports/executive_summary/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | USAGE="Usage: bash $0 {monthly|weekly} [report_start_yyyymmdd]\nIf not specified, report start defaults to the most recent completed reporting period." 4 | OUTPUT=output 5 | if [ ! -d "$OUTPUT" ]; then 6 | mkdir -p "$OUTPUT" 7 | fi 8 | 9 | # First argument is "mode". It is required. 10 | MODE=$1 11 | if [ "$MODE" != "weekly" -a "$MODE" != "monthly" ]; then 12 | echo "Error: specify 'weekly' or 'monthly' report mode." 13 | echo -e $USAGE 14 | exit 1 15 | fi 16 | 17 | # If we have a date argument, use that as the report start day. 18 | TARGET=$2 19 | if [ -z "$TARGET" ]; then 20 | # Default to processing the most recent completed reporting period. 21 | # For a week, that is the period ending on the most recent Saturday (and 22 | # starting on the prior Sunday) 23 | # For a month, it is the period starting on the first of the previous month. 24 | if [ "$MODE" = "weekly" ]; then 25 | # The Sunday of the previous complete week 26 | TARGET=$(date -d 'last sunday - 1 week' +%Y%m%d) 27 | else 28 | # The first day of the previous complete month 29 | TARGET=$(date -d '1 month ago' +%Y%m01) 30 | fi 31 | fi 32 | 33 | echo "Running $MODE report for period starting on $TARGET" 34 | 35 | # Make sure we have 'jq' and other prereqs 36 | export DEBIAN_FRONTEND=noninteractive; sudo apt-get --yes --force-yes install jq libpq-dev python-dev 37 | sudo pip install psycopg2 38 | 39 | # Fetch db connection details 40 | META=net-mozaws-prod-us-west-2-pipeline-metadata 41 | # Get metadata: 42 | aws s3 cp s3://$META/sources.json ./ 43 | RC=$? 44 | # Check if the copy succeeded. See: 45 | # http://docs.aws.amazon.com/cli/latest/topic/return-codes.html 46 | if [ "$RC" -ne "0" ]; then 47 | echo "ERROR $RC fetching data sources." 48 | exit 2 49 | fi 50 | META_PREFIX=$(jq -r '.["telemetry-executive-summary-db"]["metadata_prefix"]' < sources.json) 51 | # Get read-only credentials: 52 | aws s3 cp s3://$META/$META_PREFIX/read/credentials.json ./ 53 | RC=$? 54 | if [ "$RC" -ne "0" ]; then 55 | echo "ERROR $RC fetching read credentials." 56 | exit 3 57 | fi 58 | 59 | DB_HOST=$(jq -r '.host' < credentials.json) 60 | DB_PORT=$(jq -r '.port' < credentials.json) 61 | DB_NAME=$(jq -r '.db_name' < credentials.json) 62 | DB_USER=$(jq -r '.username' < credentials.json) 63 | DB_PASS=$(jq -r '.password' < credentials.json) 64 | 65 | # Code expects a URL of the form: 66 | # postgresql://username:password@hostname:port/dbname 67 | DB_URL="postgresql://${DB_USER}:${DB_PASS}@${DB_HOST}:${DB_PORT}/${DB_NAME}" 68 | 69 | CURRENT="$OUTPUT/executive_report.${MODE}.${TARGET}.csv" 70 | time python run_executive_report.py \ 71 | --verbose \ 72 | --check-tables \ 73 | --db-url "$DB_URL" \ 74 | --report-start $TARGET \ 75 | --mode $MODE > "$CURRENT" 76 | 77 | RC=$? 78 | if [ "$RC" -ne "0" ]; then 79 | echo "ERROR $RC running report." 
80 | exit 5 81 | fi 82 | 83 | OVERALL="v4-${MODE}.csv" 84 | DASHBOARD_S3="s3://net-mozaws-prod-metrics-data/firefox-dashboard" 85 | echo "Fetching previous state from $OVERALL..." 86 | aws s3 cp "$DASHBOARD_S3/$OVERALL" ./ 87 | RC=$? 88 | 89 | if [ -s "$OVERALL" ]; then 90 | if [ "$RC" -eq "0" ]; then 91 | echo "Backing up previous state" 92 | # If we have an existing file, back it up. 93 | cp "$OVERALL" "$OUTPUT/${OVERALL}.pre_${TARGET}" 94 | gzip "$OUTPUT/${OVERALL}.pre_${TARGET}" 95 | # TODO: Should we grep -v the TARGET date, replacing instead of potentially 96 | # duplicating? 97 | else 98 | echo "ERROR $RC fetching previous state, aborting." 99 | exit 4 100 | fi 101 | else 102 | echo "No previous state found, starting fresh" 103 | # If we don't have a previous state, add the header line from this run. 104 | head -n 1 "$CURRENT" > "$OVERALL" 105 | fi 106 | 107 | echo "Checking if the csv header is the same. Diffs:" 108 | HEADER_DIFFS=$(diff <(head -n 1 $OVERALL) <(head -n 1 $CURRENT)) 109 | if [ ! -z "$HEADER_DIFFS" ]; then 110 | echo "WARNING: headers were different. <<overall >>current" 111 | echo $HEADER_DIFFS 112 | else 113 | echo "None. Headers match." 114 | fi 115 | 116 | echo "Appending current data to overall state (minus header)" 117 | tail -n +2 "$CURRENT" >> "$OVERALL" 118 | 119 | # Run the cleanup script 120 | python reformat_v4.py --file "$OVERALL" --output "$OVERALL" 121 | 122 | echo "Uploading updated state back to dashboard bucket" 123 | # Upload the state back. 124 | aws s3 cp "$OVERALL" "$DASHBOARD_S3/" --acl bucket-owner-full-control 125 | RC=$? 126 | if [ "$RC" -ne "0" ]; then 127 | echo "ERROR $RC re-uploading to dashboard bucket ($DASHBOARD_S3)." 128 | fi 129 | 130 | # Then stick it in the output dir 131 | mv "$OVERALL" "$OUTPUT/" 132 | 133 | # And finally gzip it. 134 | gzip "$OUTPUT/$OVERALL" 135 | -------------------------------------------------------------------------------- /reports/fennec_dashboard/README.txt: -------------------------------------------------------------------------------- 1 | To Deploy 2 | ========= 3 | Until Bug 1258685 lands, the notebook will automatically select the operating mode ("weekly" or 4 | "monthly") based on the notebook file name. For this reason, two different Spark jobs need 5 | to be scheduled. 6 | 7 | Weekly aggregation 8 | ------------------ 9 | 10 | 1. Log in to Telemetry Self-Serve Data Analysis 11 | 2. Click 'Schedule a Spark Job' 12 | 3. Edit or create a job with the following parameters: 13 | Job Name: telemetry-fennec-dashboard-weekly 14 | Notebook or Jar: summarize_csv_weekly.ipynb 15 | Spark Submission Args: N/A 16 | Cluster Size: 5 17 | Output Visibility: Private 18 | Schedule Frequency: Weekly 19 | Day of Week: N/A (Sunday) 20 | Day of Month: N/A (1) 21 | Time of Day (UTC): 4am 22 | Job Timeout (minutes): 300 23 | 24 | Monthly aggregation 25 | ------------------ 26 | 27 | 1. Log in to Telemetry Self-Serve Data Analysis 28 | 2. Click 'Schedule a Spark Job' 29 | 3.
Edit or create a job with the following parameters: 30 | Job Name: telemetry-fennec-dashboard-monthly 31 | Notebook or Jar: summarize_csv_monthly.ipynb 32 | Spark Submission Args: N/A 33 | Cluster Size: 10 34 | Output Visibility: Private 35 | Schedule Frequency: Monthly 36 | Day of Week: N/A (Sunday) 37 | Day of Month: N/A (1) 38 | Time of Day (UTC): 4am 39 | Job Timeout (minutes): 300 40 | -------------------------------------------------------------------------------- /reports/loop/hindsight.cfg: -------------------------------------------------------------------------------- 1 | output_path = "output" 2 | output_size = 1024 * 1024 * 1024 3 | sandbox_load_path = "" 4 | sandbox_run_path = "run" 5 | analysis_threads = 1 6 | analysis_lua_path = "/usr/lib/luasandbox/modules/?.lua;/mnt/work/heka/share/heka/lua_modules/?.lua" 7 | analysis_lua_cpath = "/usr/lib/luasandbox/modules/?.so;/mnt/work/heka/share/heka/lua_modules/?.so" 8 | io_lua_path = analysis_lua_path .. ";/usr/lib/luasandbox/io_modules/?.lua;/mnt/work/heka/share/heka/lua_io_modules/?.lua" 9 | io_lua_cpath = analysis_lua_cpath .. ";/usr/lib/luasandbox/io_modules/?.so;/mnt/work/heka/share/heka/lua_io_modules/?.so" 10 | max_message_size = 8 * 1024 * 1024 11 | backpressure = 2 12 | 13 | analysis_defaults = { 14 | output_limit = 0, 15 | memory_limit = 0, 16 | instruction_limit = 0, 17 | ticker_interval = 0, 18 | preserve_data = false, 19 | } 20 | 21 | input_defaults = { 22 | output_limit = 8 * 1024 * 1024, 23 | instruction_limit = 0, 24 | preserve_data = false, 25 | } 26 | 27 | output_defaults = { 28 | output_limit = 8 * 1024 * 1024, 29 | ticker_interval = 0, 30 | instruction_limit = 0, 31 | memory_limit = 0, 32 | preserve_data = false, 33 | } 34 | 35 | -------------------------------------------------------------------------------- /reports/loop/run/analysis/hll_check.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Sanity check to make sure the low loop traffic volumes don't throw off the 7 | loop hyperloglog results too much. 
8 | --]] 9 | 10 | require "hyperloglog" 11 | require "math" 12 | 13 | local days = {} 14 | 15 | function process_message() 16 | local day = math.floor(read_message("Timestamp") / 1e9 / 86400) 17 | local d = days[day] 18 | if not d then 19 | d = {hyperloglog.new(), {}} 20 | days[day] = d 21 | end 22 | local p = read_message("Fields[uid]") 23 | d[1]:add(p) 24 | d[2][p] = true 25 | return 0 26 | end 27 | 28 | function timer_event(ns, shutdown) 29 | for k,v in pairs(days) do 30 | local cnt = v[1]:count() 31 | local acnt = 0 32 | for m,n in pairs (v[2]) do 33 | acnt = acnt + 1 34 | end 35 | print(k, "hll", cnt, "actual", acnt, "percentage", cnt/acnt) 36 | end 37 | end 38 | -------------------------------------------------------------------------------- /reports/loop/run/analysis/hll_check.off: -------------------------------------------------------------------------------- 1 | filename = "hll_check.lua" 2 | message_matcher = "Logger == 'mozilla-loop-server' && Type == 'request.summary' && Fields[uid] != NIL && Fields[action] == 'join' && (Fields[userType] == 'Unregistered' || Fields[userType] == 'Registered')" 3 | thread = 0 4 | -------------------------------------------------------------------------------- /reports/loop/run/analysis/retention.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | WARNING THIS PLUGIN EXPECTS THE DATA ORDERED BY DAY WITH NO GAPS. 7 | --]] 8 | 9 | require "cjson" 10 | require "math" 11 | require "os" 12 | require "table" 13 | 14 | local DAY_OFFSET = 4 -- start the week on Sunday and correct for the Unix epoch landing on a Thursday 15 | local SEC_IN_DAY = 60 * 60 * 24 16 | local SEC_IN_WEEK = SEC_IN_DAY * 7 17 | 18 | local COHORT = 1 19 | local DAY = 2 20 | local uids = {} -- each key has an array columns: cohort, day, interval flag 21 | 22 | local interval_days = read_config("interval_days") or error("an interval_days must be configured") 23 | 24 | function process_message() 25 | local day = math.floor(read_message("Timestamp") / 1e9 / SEC_IN_DAY) 26 | local week = math.floor((day + DAY_OFFSET) / 7) 27 | local cohort = week * SEC_IN_WEEK - (SEC_IN_DAY * DAY_OFFSET) 28 | local uid = read_message("Fields[uid]") 29 | 30 | local u = uids[uid] 31 | if not u then 32 | u = {cohort, day} 33 | uids[uid] = u 34 | return 0 35 | end 36 | local delta = day - u[DAY] 37 | if delta <= 0 then return 0 end 38 | 39 | local interval = math.floor((delta - 1) / interval_days) 40 | local cinterval = #u - 2 41 | if interval == cinterval then 42 | u[cinterval + 3] = true 43 | end 44 | return 0 45 | end 46 | 47 | 48 | function timer_event(ns, shutdown) 49 | local cohorts = {} 50 | for k, u in pairs(uids) do 51 | local cohort = u[COHORT] 52 | local c = cohorts[cohort] 53 | if not c then 54 | c = {user_count = 1, intervals = {}} 55 | cohorts[cohort] = c 56 | else 57 | c.user_count = c.user_count + 1 58 | end 59 | for i, j in ipairs(u) do 60 | if i > 2 and j then -- skip the cohort and day entries 61 | local value = c.intervals[i - 2] 62 | if not value then 63 | c.intervals[i - 2] = 1 64 | else 65 | c.intervals[i - 2] = value + 1 66 | end 67 | end 68 | end 69 | end 70 | 71 | local json = {interval_days = interval_days, cohorts = {}} 72 | for k, c in pairs(cohorts) do 73 | json.cohorts[#json.cohorts + 1] = {cohort = os.date("%Y%m%d", 
k), cohort_user_count = c.user_count, interval_counts = c.intervals} 74 | end 75 | table.sort(json.cohorts, function(t1, t2) return t1.cohort < t2.cohort end) 76 | inject_payload("json", "retention", cjson.encode(json)) 77 | end 78 | -------------------------------------------------------------------------------- /reports/loop/run/analysis/retention_daily.cfg: -------------------------------------------------------------------------------- 1 | filename = "retention.lua" 2 | message_matcher = "Logger == 'mozilla-loop-server' && Type == 'request.summary' && Fields[uid] != NIL && Fields[action] == 'join' && (Fields[userType] == 'Unregistered' || Fields[userType] == 'Registered')" 3 | interval_days = 1 4 | thread = 1 5 | -------------------------------------------------------------------------------- /reports/loop/run/analysis/retention_monthly.cfg: -------------------------------------------------------------------------------- 1 | filename = "retention.lua" 2 | message_matcher = "Logger == 'mozilla-loop-server' && Type == 'request.summary' && Fields[uid] != NIL && Fields[action] == 'join' && (Fields[userType] == 'Unregistered' || Fields[userType] == 'Registered')" 3 | interval_days = 28 4 | thread = 2 5 | -------------------------------------------------------------------------------- /reports/loop/run/analysis/retention_weekly.cfg: -------------------------------------------------------------------------------- 1 | filename = "retention.lua" 2 | message_matcher = "Logger == 'mozilla-loop-server' && Type == 'request.summary' && Fields[uid] != NIL && Fields[action] == 'join' && (Fields[userType] == 'Unregistered' || Fields[userType] == 'Registered')" 3 | interval_days = 7 4 | thread = 3 5 | -------------------------------------------------------------------------------- /reports/loop/run/analysis/xau.cfg: -------------------------------------------------------------------------------- 1 | filename = "xau.lua" 2 | message_matcher = "Logger == 'mozilla-loop-server' && Type == 'request.summary' && Fields[uid] != NIL && Fields[action] == 'join' && (Fields[userType] == 'Unregistered' || Fields[userType] == 'Registered')" 3 | thread = 0 4 | -------------------------------------------------------------------------------- /reports/loop/run/analysis/xau.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | WARNING THIS PLUGIN EXPECTS THE DATA ORDERED BY DAY WITH NO GAPS. 
7 | 8 | If it is going to be run for more than this one-off we should make it more robust 9 | --]] 10 | 11 | 12 | require "cjson" 13 | require "hyperloglog" 14 | require "math" 15 | require "os" 16 | require "table" 17 | 18 | local SEC_IN_DAY = 60 * 60 * 24 19 | 20 | local days = {} 21 | local cday = -1 22 | local hll 23 | 24 | local function compute_range(i, len) 25 | if i == 1 then return nil end 26 | local s = i - (len - 1) 27 | if s < 1 then 28 | s = 1 29 | end 30 | return hyperloglog.count(unpack(days, s, i)) 31 | end 32 | 33 | function process_message() 34 | local day = math.floor(read_message("Timestamp") / 1e9 / SEC_IN_DAY) 35 | if cday ~= -1 and (day < cday or day > cday + 1) then 36 | print("day", os.date("%Y%m%d", day * SEC_IN_DAY) , "cday", os.date("%Y%m%d", cday * SEC_IN_DAY)) 37 | error("data is out of order or has gaps") 38 | end 39 | 40 | if day ~= cday then 41 | hll = hyperloglog.new() 42 | days[#days + 1] = hll 43 | cday = day 44 | end 45 | hll:add(read_message("Fields[uid]")) 46 | return 0 47 | end 48 | 49 | function timer_event(ns, shutdown) 50 | local fday = cday - #days 51 | local json = {} 52 | for i, v in ipairs(days) do 53 | local dau = v:count() 54 | local wau = compute_range(i, 7) or dau 55 | local mau = compute_range(i, 28) or dau 56 | json[#json + 1] = {date = os.date("%Y%m%d", (fday + i) * SEC_IN_DAY), dau = dau, wau = wau, mau = mau} 57 | end 58 | table.sort(json, function(t1, t2) return t1.date < t2.date end) 59 | inject_payload("json", "xau", cjson.encode(json)) 60 | end 61 | -------------------------------------------------------------------------------- /reports/loop/run/input/server_logs.cfg: -------------------------------------------------------------------------------- 1 | filename = "server_logs.lua" 2 | start_date = "2015-11-01" 3 | end_date = "2016-05-12" 4 | service = "^loop%-app" 5 | -------------------------------------------------------------------------------- /reports/loop/run/input/server_logs.lua: -------------------------------------------------------------------------------- 1 | -- This Source Code Form is subject to the terms of the Mozilla Public 2 | -- License, v. 2.0. If a copy of the MPL was not distributed with this 3 | -- file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 | 5 | --[[ 6 | Reads the files list application server logs from S3 for reporting.. 7 | 8 | Config: 9 | 10 | filename = "server_logs.lua" 11 | start_date = "2015-11-01" 12 | end_date = "2016-05-11" 13 | service = "^loop%-app" 14 | --]] 15 | 16 | require "heka_stream_reader" 17 | require "io" 18 | require "os" 19 | require "string" 20 | 21 | local date_format = "^(%d%d%d%d)%-(%d%d)%-(%d%d)$" 22 | local service = read_config("service") or "." 
23 | local start_date = read_config("start_date") 24 | local end_date = read_config("end_date") 25 | 26 | local syear, smonth, sday = start_date:match(date_format) 27 | start_date = os.time({year = syear, month = smonth, day = sday}) 28 | 29 | local eyear, emonth, eday = end_date:match(date_format) 30 | end_date = os.time({year = eyear, month = emonth, day = eday}) 31 | 32 | assert(end_date >= start_date, "end_date must be greater than or equal to the start_date") 33 | local num_months = (eyear * 12 + emonth) - (syear * 12 + smonth) 34 | 35 | local function get_file_list(year, month) 36 | local path = string.format("s3://heka-logs/shared/%04d-%02d/", year, month) 37 | local list = {} 38 | 39 | local fh = assert(io.popen(string.format("aws s3 ls %s", path))) 40 | for line in fh:lines() do 41 | local fn, ds = string.match(line, "^%d%d%d%d%-%d%d%-%d%d%s+%d%d:%d%d:%d%d%s+%d+%s+(.-%-(%d%d%d%d%d%d%d%d)_.+)") 42 | if ds then 43 | ds = os.time({year = ds:sub(1, 4), month = ds:sub(5, 6), day = ds:sub(7, 8)}) 44 | if fn and string.match(fn, service) and ds >= start_date and ds <= end_date then 45 | list[#list + 1] = fn 46 | end 47 | end 48 | end 49 | fh:close() 50 | return path, list 51 | end 52 | 53 | 54 | local msg = { 55 | Timestamp = 0, 56 | Type = "", 57 | Logger = "", 58 | Fields = { 59 | action = "", 60 | userType = "", 61 | uid = "" 62 | } 63 | } 64 | 65 | function process_message() 66 | local year = tonumber(syear) 67 | local month = tonumber(smonth) 68 | for i=0, num_months do 69 | local path, list = get_file_list(year, month) 70 | for i,fn in ipairs(list) do 71 | local hsr = heka_stream_reader.new(path) 72 | print("processing", fn) 73 | local fh = assert(io.popen(string.format("aws s3 cp %s%s - | gzip -d -c", path, fn))) 74 | local found, consumed, read 75 | repeat 76 | repeat 77 | found, consumed, read = hsr:find_message(fh) 78 | if found then 79 | -- inject_message(hsr) -- todo remove loop filtering 80 | 81 | -- filtering/data reduction for loop testing 82 | local action = hsr:read_message("Fields[action]") 83 | local userType = hsr:read_message("Fields[userType]") 84 | local uid = hsr:read_message("Fields[uid]") 85 | if uid and action == "join" and (userType == "Unregistered" or userType == "Registered") then 86 | msg.Timestamp = hsr:read_message("Timestamp") 87 | msg.Type = hsr:read_message("Type") 88 | msg.Logger = hsr:read_message("Logger") 89 | msg.Fields.action = action 90 | msg.Fields.userType = userType 91 | msg.Fields.uid = uid 92 | inject_message(msg) 93 | end 94 | -- end loop testing 95 | end 96 | until not found 97 | until read == 0 98 | fh:close() 99 | end 100 | month = month + 1 101 | if month == 13 then 102 | month = 1 103 | year = year + 1 104 | end 105 | end 106 | return 0 107 | end 108 | -------------------------------------------------------------------------------- /reports/loop/run/output/placeholder.off: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mozilla-services/data-pipeline/0c94d328f243338d21bae360547c300ac1b82b12/reports/loop/run/output/placeholder.off -------------------------------------------------------------------------------- /reports/stability-summary/run.sh: -------------------------------------------------------------------------------- 1 | set -e 2 | 3 | sudo apt-get -y install postgresql-client python-psycopg2 jq 4 | sudo easy_install boto3 5 | 6 | # Fetch metadata 7 | META=net-mozaws-prod-us-west-2-pipeline-metadata 8 | # Get metadata: 9 | aws s3 cp s3://$META/sources.json ./ 10 
| 11 | JOBNAME=telemetry-executive-summary-db 12 | META_PREFIX=$(jq -r ".[\"${JOBNAME}\"][\"metadata_prefix\"]" < sources.json) 13 | 14 | aws s3 cp s3://$META/$META_PREFIX/write/credentials.json ./ 15 | 16 | DB_HOST=$(jq -r '.["host"]' < credentials.json) 17 | DB_NAME=$(jq -r '.["db_name"]' < credentials.json) 18 | DB_USER=$(jq -r '.["username"]' < credentials.json) 19 | DB_PW=$(jq -r '.["password"]' < credentials.json) 20 | 21 | CONNECTION_STRING="host=$DB_HOST dbname=$DB_NAME user=$DB_USER password=$DB_PW" 22 | 23 | echo "running rollup.py" 24 | python rollup.py -d "$CONNECTION_STRING" 25 | -------------------------------------------------------------------------------- /reports/stability-summary/summarize.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | from utils import S3CompressedReader, S3CompressedWriter, HeaderCSVReader 4 | from collections import defaultdict, Counter 5 | from itertools import izip, count 6 | 7 | default_bucket = 'telemetry-public-analysis-2' 8 | 9 | prop_list = ( 10 | 'abortedsessioncount', 11 | 'subsessionlengths', 12 | 'abortsplugin', 13 | 'abortscontent', 14 | 'abortsgmplugin', 15 | 'crashesdetectedplugin', 16 | 'pluginhangs', 17 | 'crashesdetectedcontent', 18 | 'crashesdetectedgmplugin', 19 | 'crashsubmitattemptmain', 20 | 'crashsubmitattemptcontent', 21 | 'crashsubmitattemptplugin', 22 | 'crashsubmitsuccessmain', 23 | 'crashsubmitsuccesscontent', 24 | 'crashsubmitsuccessplugin') 25 | 26 | class Counts(object): 27 | def __init__(self): 28 | self._counts = [0] * len(prop_list) 29 | self.crashes = 0 30 | 31 | def increment(self, i, v): 32 | self._counts[i] += v 33 | 34 | def final(self, **kwargs): 35 | d = dict(izip(prop_list, self._counts)) 36 | d.update(kwargs) 37 | d['crashesdetectedmain'] = self.crashes 38 | return d 39 | 40 | def nullint(v): 41 | if v == '': 42 | return 0 43 | return int(v) 44 | 45 | def summarize(date): 46 | """ 47 | read the large CSV file produced by rollup.put_counts and 48 | rollup.put_crashes into a smaller summary JSON format for quick overview 49 | graphing. 
50 | """ 51 | 52 | counts = defaultdict(Counts) 53 | 54 | counts_path = 'stability-rollups/{year}/{date}-main.csv.gz'.format( 55 | year=date.year, date=date.strftime('%Y%m%d')) 56 | csvheaders, reader = HeaderCSVReader( 57 | S3CompressedReader(default_bucket, counts_path)) 58 | key_indexes = [csvheaders.index(prop) 59 | for prop in ('channel', 'buildid', 'os')] 60 | csv_indexes = [(csvheaders.index(prop), propidx) 61 | for propidx, prop in izip(count(), prop_list)] 62 | for row in reader: 63 | key = tuple(row[idx] for idx in key_indexes) 64 | counter = counts[key] 65 | for csvidx, propidx in csv_indexes: 66 | counter.increment(propidx, nullint(row[csvidx])) 67 | 68 | crashes_path = 'stability-rollups/{year}/{date}-crashes.csv.gz'.format( 69 | year=date.year, date=date.strftime('%Y%m%d')) 70 | csvheaders, reader = HeaderCSVReader( 71 | S3CompressedReader(default_bucket, crashes_path)) 72 | key_indexes = [csvheaders.index(prop) 73 | for prop in ('channel', 'buildid', 'os')] 74 | for row in reader: 75 | key = tuple(row[idx] for idx in key_indexes) 76 | counts[key].crashes += nullint(row[-1]) 77 | 78 | summary_path = 'stability-rollups/{year}/{date}-summary.json.gz'.format( 79 | year=date.year, date=date.strftime('%Y%m%d')) 80 | with S3CompressedWriter(default_bucket, summary_path) as fd: 81 | json.dump([c.final(channel=channel, buildid=buildid, os=os) 82 | for (channel, buildid, os), c in counts.iteritems()], fd) 83 | 84 | if __name__ == '__main__': 85 | import sys 86 | from datetime import date, timedelta 87 | start = date(2015, 11, 5) 88 | end = date(2015, 11, 30) 89 | for i in count(): 90 | d = start + timedelta(days=i) 91 | if d > end: 92 | break 93 | summarize(d) 94 | -------------------------------------------------------------------------------- /reports/stability-summary/utils.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | from gzip import GzipFile 3 | from cStringIO import StringIO 4 | import sys 5 | import csv 6 | 7 | class S3CompressedWriter(object): 8 | def __init__(self, bucket, path, mimetype='text/plain'): 9 | self.bucket = bucket 10 | self.path = path 11 | self.mimetype = mimetype 12 | self._buffer = None 13 | 14 | def __enter__(self): 15 | self._buffer = StringIO(); 16 | self._writer = GzipFile(mode="wb", fileobj=self._buffer) 17 | return self._writer 18 | 19 | def __exit__(self, exc_type, exc_value, traceback): 20 | if exc_value is None: 21 | self._writer.close() 22 | self._buffer.seek(0) 23 | s3 = boto3.resource('s3') 24 | s3.Object(self.bucket, self.path).put(Body=self._buffer, ContentEncoding='gzip', ContentType=self.mimetype) 25 | self._buffer = None 26 | 27 | def __del__(self): 28 | assert self._buffer is None 29 | 30 | def S3CompressedReader(bucket, path): 31 | s3 = boto3.resource('s3') 32 | r = s3.Object(bucket, path).get() 33 | body = StringIO(r['Body'].read()) 34 | return GzipFile(mode="rb", fileobj=body) 35 | 36 | def HeaderCSVReader(fd, *args, **kwargs): 37 | """ 38 | Read CSV data from `fd`, separating the header list from the data. 39 | """ 40 | reader = csv.reader(fd, *args, **kwargs) 41 | header = reader.next() 42 | return header, reader 43 | --------------------------------------------------------------------------------